1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucase.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug30
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 *   Much code moved here (and modified) from uchar.c.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/udata.h" /* UDataInfo */
26 #include "unicode/utf16.h"
27 #include "ucmndata.h" /* DataHeader */
28 #include "udatamem.h"
29 #include "umutex.h"
30 #include "uassert.h"
31 #include "cmemory.h"
32 #include "utrie2.h"
33 #include "ucase.h"
34 
/*
 * Container for the case-mapping property data used by the functions in this file.
 * The code in this file accesses it through the ucase_props_singleton instance.
 * NOTE(review): presumably ucase_props_singleton is defined in the machine-generated
 * ucase_props_data.h and initializes these fields positionally, so the field
 * order must not change -- confirm against the generated header.
 */
struct UCaseProps {
    UDataMemory *mem;             /* data memory handle */
    const int32_t *indexes;       /* index array */
    const uint16_t *exceptions;   /* exceptions data, addressed via GET_EXCEPTIONS() */
    const uint16_t *unfold;       /* reverse case folding table, see ucase_addStringCaseClosure() */

    UTrie2 trie;                  /* maps a code point to its 16-bit properties word (UTRIE2_GET16) */
    uint8_t formatVersion[4];     /* data format version */
};
44 
45 /* ucase_props_data.h is machine-generated by gencase --csource */
46 #define INCLUDED_FROM_UCASE_CPP
47 #include "ucase_props_data.h"
48 
49 /* set of property starts for UnicodeSet ------------------------------------ */
50 
51 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)52 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
53     /* add the start code point to the USet */
54     const USetAdder *sa=(const USetAdder *)context;
55     sa->add(sa->set, start);
56     return TRUE;
57 }
58 
59 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder * sa,UErrorCode * pErrorCode)60 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
61     if(U_FAILURE(*pErrorCode)) {
62         return;
63     }
64 
65     /* add the start code point of each same-value range of the trie */
66     utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
67 
68     /* add code points with hardcoded properties, plus the ones following them */
69 
70     /* (none right now, see comment below) */
71 
72     /*
73      * Omit code points with hardcoded specialcasing properties
74      * because we do not build property UnicodeSets for them right now.
75      */
76 }
77 
78 /* data access primitives --------------------------------------------------- */
79 
80 U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie()81 ucase_getTrie() {
82     return &ucase_props_singleton.trie;
83 }
84 
/*
 * Pointer into the exceptions array of csp for a properties word:
 * the array start plus the props bits above UCASE_EXC_SHIFT.
 * The uint16_t at that address is the initial exceptions word (excWord).
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
86 
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit integer value b.
 *
 * The 256 entries are generated with nested helper macros:
 * each level enumerates the four values of the next lower pair of bits,
 * which contribute 0, 1, 1 and 2 set bits respectively.
 */
#define UCASE_BITS2_(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS4_(n) UCASE_BITS2_(n), UCASE_BITS2_((n)+1), UCASE_BITS2_((n)+1), UCASE_BITS2_((n)+2)
#define UCASE_BITS6_(n) UCASE_BITS4_(n), UCASE_BITS4_((n)+1), UCASE_BITS4_((n)+1), UCASE_BITS4_((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_BITS6_(0), UCASE_BITS6_(1), UCASE_BITS6_(1), UCASE_BITS6_(2)
};
#undef UCASE_BITS6_
#undef UCASE_BITS4_
#undef UCASE_BITS2_
106 
/* Nonzero if bit idx is set in flags, that is, if the optional slot idx is present. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * Number of flags bits that are set below bit idx, that is,
 * the number of slots that are present before slot idx.
 * Looked up in the 256-entry flagsOffset[] table, so the masked value must fit into 8 bits.
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
109 
/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, then each slot is one
 * uint16_t; otherwise each slot is a pair of uint16_t which is read
 * high half first and combined into one 32-bit value.
 *
 * The macro expands to a single do { ... } while(0) statement so that it is
 * safe inside unbraced if/else bodies. Use it as a statement with a
 * trailing semicolon.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    do { \
        if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
            (pExc16)+=SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16); \
        } else { \
            (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
            (value)=*(pExc16)++; \
            (value)=((value)<<16)|*(pExc16); \
        } \
    } while(0)
128 
129 /* simple case mappings ----------------------------------------------------- */
130 
131 U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c)132 ucase_tolower(UChar32 c) {
133     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
134     if(!UCASE_HAS_EXCEPTION(props)) {
135         if(UCASE_IS_UPPER_OR_TITLE(props)) {
136             c+=UCASE_GET_DELTA(props);
137         }
138     } else {
139         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
140         uint16_t excWord=*pe++;
141         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
142             int32_t delta;
143             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
144             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
145         }
146         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
147             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
148         }
149     }
150     return c;
151 }
152 
153 U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c)154 ucase_toupper(UChar32 c) {
155     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
156     if(!UCASE_HAS_EXCEPTION(props)) {
157         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
158             c+=UCASE_GET_DELTA(props);
159         }
160     } else {
161         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
162         uint16_t excWord=*pe++;
163         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
164             int32_t delta;
165             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
166             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
167         }
168         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
169             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
170         }
171     }
172     return c;
173 }
174 
175 U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c)176 ucase_totitle(UChar32 c) {
177     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
178     if(!UCASE_HAS_EXCEPTION(props)) {
179         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
180             c+=UCASE_GET_DELTA(props);
181         }
182     } else {
183         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
184         uint16_t excWord=*pe++;
185         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
186             int32_t delta;
187             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
188             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
189         }
190         int32_t idx;
191         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
192             idx=UCASE_EXC_TITLE;
193         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
194             idx=UCASE_EXC_UPPER;
195         } else {
196             return c;
197         }
198         GET_SLOT_VALUE(excWord, idx, pe, c);
199     }
200     return c;
201 }
202 
/*
 * Constant UTF-16 strings: a base letter (0x69 'i', 0x6a 'j', or 0x12f)
 * followed by U+0307 (combining dot above), and for the last three strings
 * followed by one more code unit (0x300, 0x301, 0x303).
 * In this part of the file only iDot is used, by ucase_addCaseClosure().
 * Note: iOgonekDot is declared with 3 elements but has only 2 initializers,
 * so its third element is implicitly zero.
 */
static const UChar iDot[2] = { 0x69, 0x307 };
static const UChar jDot[2] = { 0x6a, 0x307 };
static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
209 
210 
211 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c,const USetAdder * sa)212 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
213     uint16_t props;
214 
215     /*
216      * Hardcode the case closure of i and its relatives and ignore the
217      * data file data for these characters.
218      * The Turkic dotless i and dotted I with their case mapping conditions
219      * and case folding option make the related characters behave specially.
220      * This code matches their closure behavior to their case folding behavior.
221      */
222 
223     switch(c) {
224     case 0x49:
225         /* regular i and I are in one equivalence class */
226         sa->add(sa->set, 0x69);
227         return;
228     case 0x69:
229         sa->add(sa->set, 0x49);
230         return;
231     case 0x130:
232         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
233         sa->addString(sa->set, iDot, 2);
234         return;
235     case 0x131:
236         /* dotless i is in a class by itself */
237         return;
238     default:
239         /* otherwise use the data file data */
240         break;
241     }
242 
243     props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
244     if(!UCASE_HAS_EXCEPTION(props)) {
245         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
246             /* add the one simple case mapping, no matter what type it is */
247             int32_t delta=UCASE_GET_DELTA(props);
248             if(delta!=0) {
249                 sa->add(sa->set, c+delta);
250             }
251         }
252     } else {
253         /*
254          * c has exceptions, so there may be multiple simple and/or
255          * full case mappings. Add them all.
256          */
257         const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
258         const UChar *closure;
259         uint16_t excWord=*pe++;
260         int32_t idx, closureLength, fullLength, length;
261 
262         pe0=pe;
263 
264         /* add all simple case mappings */
265         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
266             if(HAS_SLOT(excWord, idx)) {
267                 pe=pe0;
268                 GET_SLOT_VALUE(excWord, idx, pe, c);
269                 sa->add(sa->set, c);
270             }
271         }
272         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
273             pe=pe0;
274             int32_t delta;
275             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
276             sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
277         }
278 
279         /* get the closure string pointer & length */
280         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
281             pe=pe0;
282             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
283             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
284             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
285         } else {
286             closureLength=0;
287             closure=NULL;
288         }
289 
290         /* add the full case folding */
291         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
292             pe=pe0;
293             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
294 
295             /* start of full case mapping strings */
296             ++pe;
297 
298             fullLength&=0xffff; /* bits 16 and higher are reserved */
299 
300             /* skip the lowercase result string */
301             pe+=fullLength&UCASE_FULL_LOWER;
302             fullLength>>=4;
303 
304             /* add the full case folding string */
305             length=fullLength&0xf;
306             if(length!=0) {
307                 sa->addString(sa->set, (const UChar *)pe, length);
308                 pe+=length;
309             }
310 
311             /* skip the uppercase and titlecase strings */
312             fullLength>>=4;
313             pe+=fullLength&0xf;
314             fullLength>>=4;
315             pe+=fullLength;
316 
317             closure=(const UChar *)pe; /* behind full case mappings */
318         }
319 
320         /* add each code point in the closure string */
321         for(idx=0; idx<closureLength;) {
322             U16_NEXT_UNSAFE(closure, idx, c);
323             sa->add(sa->set, c);
324         }
325     }
326 }
327 
328 /*
329  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
330  * must be length>0 and max>0 and length<=max
331  */
332 static inline int32_t
strcmpMax(const UChar * s,int32_t length,const UChar * t,int32_t max)333 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
334     int32_t c1, c2;
335 
336     max-=length; /* we require length<=max, so no need to decrement max in the loop */
337     do {
338         c1=*s++;
339         c2=*t++;
340         if(c2==0) {
341             return 1; /* reached the end of t but not of s */
342         }
343         c1-=c2;
344         if(c1!=0) {
345             return c1; /* return difference result */
346         }
347     } while(--length>0);
348     /* ends with length==0 */
349 
350     if(max==0 || *t==0) {
351         return 0; /* equal to length of both strings */
352     } else {
353         return -max; /* return lengh difference */
354     }
355 }
356 
357 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UChar * s,int32_t length,const USetAdder * sa)358 ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
359     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
360 
361     if(ucase_props_singleton.unfold==NULL || s==NULL) {
362         return FALSE; /* no reverse case folding data, or no string */
363     }
364     if(length<=1) {
365         /* the string is too short to find any match */
366         /*
367          * more precise would be:
368          * if(!u_strHasMoreChar32Than(s, length, 1))
369          * but this does not make much practical difference because
370          * a single supplementary code point would just not be found
371          */
372         return FALSE;
373     }
374 
375     const uint16_t *unfold=ucase_props_singleton.unfold;
376     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
377     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
378     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
379     unfold+=unfoldRowWidth;
380 
381     if(length>unfoldStringWidth) {
382         /* the string is too long to find any match */
383         return FALSE;
384     }
385 
386     /* do a binary search for the string */
387     start=0;
388     limit=unfoldRows;
389     while(start<limit) {
390         i=(start+limit)/2;
391         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
392         result=strcmpMax(s, length, p, unfoldStringWidth);
393 
394         if(result==0) {
395             /* found the string: add each code point, and its case closure */
396             UChar32 c;
397 
398             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
399                 U16_NEXT_UNSAFE(p, i, c);
400                 sa->add(sa->set, c);
401                 ucase_addCaseClosure(c, sa);
402             }
403             return TRUE;
404         } else if(result<0) {
405             limit=i;
406         } else /* result>0 */ {
407             start=i+1;
408         }
409     }
410 
411     return FALSE; /* string not found */
412 }
413 
414 U_NAMESPACE_BEGIN
415 
/*
 * Sets up an iteration over the unfold (reverse case folding) table of
 * ucase_props_singleton.
 * The row count, the row width and the string width are read from the
 * UCASE_UNFOLD_* entries at the start of the table; the constructor body then
 * advances unfold by one row width so that it points behind that header row.
 * The iteration starts in row 0, with rowCpIndex at the first code unit
 * behind the string column.
 * The initializers for the dimensions read through the unfold member, so
 * they rely on unfold being initialized first.
 * NOTE(review): ucase_props_singleton.unfold is dereferenced here without
 * the NULL check that ucase_addStringCaseClosure() performs -- confirm that
 * it can never be NULL.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    unfold+=unfoldRowWidth;
}
425 
426 UChar32
next(UnicodeString & full)427 FullCaseFoldingIterator::next(UnicodeString &full) {
428     // Advance past the last-delivered code point.
429     const UChar *p=unfold+(currentRow*unfoldRowWidth);
430     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
431         ++currentRow;
432         p+=unfoldRowWidth;
433         rowCpIndex=unfoldStringWidth;
434     }
435     if(currentRow>=unfoldRows) { return U_SENTINEL; }
436     // Set "full" to the NUL-terminated string in the first unfold column.
437     int32_t length=unfoldStringWidth;
438     while(length>0 && p[length-1]==0) { --length; }
439     full.setTo(FALSE, p, length);
440     // Return the code point.
441     UChar32 c;
442     U16_NEXT_UNSAFE(p, rowCpIndex, c);
443     return c;
444 }
445 
/*
 * Case mapping tables with one int8_t entry per index below LIMIT.
 * Each table here lists 6 groups of 64 entries (384 entries).
 * NOTE(review): presumably each table is indexed by code point (U+0000..U+017F),
 * a plain value is the delta to add to the code point, and EXC marks a code point
 * that needs other handling -- confirm against the LIMIT/EXC declarations
 * and the users of these tables.
 */
namespace LatinCase {

/* Lowercasing: e.g. 32 at 0x41..0x5a, EXC at 0xb5, 0xdf, 0x130, 0x149 and 0x17f. */
const int8_t TO_LOWER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

/*
 * Same as TO_LOWER_NORMAL except that the entries at
 * 0x49, 0x4a, 0xcc, 0xcd, 0x128 and 0x12e are EXC as well.
 */
const int8_t TO_LOWER_TR_LT[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

/* Uppercasing: e.g. -32 at 0x61..0x7a, EXC at 0xb5, 0xdf, 0x131, 0x149 and 0x17f. */
const int8_t TO_UPPER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

/* Same as TO_UPPER_NORMAL except that the entry at 0x69 is EXC as well. */
const int8_t TO_UPPER_TR[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

}  // namespace LatinCase
577 
578 U_NAMESPACE_END
579 
580 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
581 U_CAPI int32_t U_EXPORT2
ucase_getType(UChar32 c)582 ucase_getType(UChar32 c) {
583     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
584     return UCASE_GET_TYPE(props);
585 }
586 
587 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
588 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(UChar32 c)589 ucase_getTypeOrIgnorable(UChar32 c) {
590     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
591     return UCASE_GET_TYPE_AND_IGNORABLE(props);
592 }
593 
594 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
595 static inline int32_t
getDotType(UChar32 c)596 getDotType(UChar32 c) {
597     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
598     if(!UCASE_HAS_EXCEPTION(props)) {
599         return props&UCASE_DOT_MASK;
600     } else {
601         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
602         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
603     }
604 }
605 
606 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c)607 ucase_isSoftDotted(UChar32 c) {
608     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
609 }
610 
611 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c)612 ucase_isCaseSensitive(UChar32 c) {
613     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
614     if(!UCASE_HAS_EXCEPTION(props)) {
615         return (UBool)((props&UCASE_SENSITIVE)!=0);
616     } else {
617         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
618         return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
619     }
620 }
621 
622 /* string casing ------------------------------------------------------------ */
623 
624 /*
625  * These internal functions form the core of string case mappings.
626  * They map single code points to result code points or strings and take
627  * all necessary conditions (context, locale ID, options) into account.
628  *
629  * They do not iterate over the source or write to the destination
630  * so that the same functions are useful for non-standard string storage,
631  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
632  * For the same reason, the "surrounding text" context is passed in as a
633  * UCaseContextIterator which does not make any assumptions about
634  * the underlying storage.
635  *
636  * This section contains helper functions that check for conditions
637  * in the input text surrounding the current code point
638  * according to SpecialCasing.txt.
639  *
640  * Each helper function gets the index
641  * - after the current code point if it looks at following text
642  * - before the current code point if it looks at preceding text
643  *
644  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
645  *
646  * Final_Sigma
647  *   C is preceded by a sequence consisting of
648  *     a cased letter and a case-ignorable sequence,
649  *   and C is not followed by a sequence consisting of
650  *     an ignorable sequence and then a cased letter.
651  *
652  * More_Above
653  *   C is followed by one or more characters of combining class 230 (ABOVE)
654  *   in the combining character sequence.
655  *
656  * After_Soft_Dotted
657  *   The last preceding character with combining class of zero before C
658  *   was Soft_Dotted,
659  *   and there is no intervening combining character class 230 (ABOVE).
660  *
661  * Before_Dot
662  *   C is followed by combining dot above (U+0307).
663  *   Any sequence of characters with a combining class that is neither 0 nor 230
664  *   may intervene between the current character and the combining dot above.
665  *
666  * The erratum from 2002-10-31 adds the condition
667  *
668  * After_I
669  *   The last preceding base character was an uppercase I, and there is no
670  *   intervening combining character class 230 (ABOVE).
671  *
672  *   (See Jitterbug 2344 and the comments on After_I below.)
673  *
674  * Helper definitions in Unicode 3.2 UAX 21:
675  *
676  * D1. A character C is defined to be cased
677  *     if it meets any of the following criteria:
678  *
679  *   - The general category of C is Titlecase Letter (Lt)
680  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
681  *   - Given D = NFD(C), then it is not the case that:
682  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
 *      for Unicode 3.2. Ignored.)
685  *
686  * D2. A character C is defined to be case-ignorable
687  *     if it meets either of the following criteria:
688  *
689  *   - The general category of C is
690  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
691  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
692  *   - C is one of the following characters
693  *     U+0027 APOSTROPHE
694  *     U+00AD SOFT HYPHEN (SHY)
695  *     U+2019 RIGHT SINGLE QUOTATION MARK
696  *            (the preferred character for apostrophe)
697  *
698  * D3. A case-ignorable sequence is a sequence of
699  *     zero or more case-ignorable characters.
700  */
701 
/*
 * Case-insensitive tests of a char against one specific letter.
 * Each macro evaluates its argument twice, so pass an expression without side effects.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? True for '_', '-' and the terminating NUL. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
713 
714 /**
715  * Requires non-NULL locale ID but otherwise does the equivalent of
716  * checking for language codes as if uloc_getLanguage() were called:
717  * Accepts both 2- and 3-letter codes and accepts case variants.
718  */
719 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale)720 ucase_getCaseLocale(const char *locale) {
721     /*
722      * This function used to use uloc_getLanguage(), but the current code
723      * removes the dependency of this low-level code on uloc implementation code
724      * and is faster because not the whole locale ID has to be
725      * examined and copied/transformed.
726      *
727      * Because this code does not want to depend on uloc, the caller must
728      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
729      */
730     char c=*locale++;
731     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
732     // and for Chinese "zh": Very common but no special case mapping behavior.
733     // Then check lowercase vs. uppercase to reduce the number of comparisons
734     // for other locales without special behavior.
735     if(c=='e') {
736         /* el or ell? */
737         c=*locale++;
738         if(is_l(c)) {
739             c=*locale++;
740             if(is_l(c)) {
741                 c=*locale;
742             }
743             if(is_sep(c)) {
744                 return UCASE_LOC_GREEK;
745             }
746         }
747         // en, es, ... -> root
748     } else if(c=='z') {
749         return UCASE_LOC_ROOT;
750 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
751     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
752 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
753     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
754 #else
755 #   error Unknown charset family!
756 #endif
757         // lowercase c
758         if(c=='t') {
759             /* tr or tur? */
760             c=*locale++;
761             if(is_u(c)) {
762                 c=*locale++;
763             }
764             if(is_r(c)) {
765                 c=*locale;
766                 if(is_sep(c)) {
767                     return UCASE_LOC_TURKISH;
768                 }
769             }
770         } else if(c=='a') {
771             /* az or aze? */
772             c=*locale++;
773             if(is_z(c)) {
774                 c=*locale++;
775                 if(is_e(c)) {
776                     c=*locale;
777                 }
778                 if(is_sep(c)) {
779                     return UCASE_LOC_TURKISH;
780                 }
781             }
782         } else if(c=='l') {
783             /* lt or lit? */
784             c=*locale++;
785             if(is_i(c)) {
786                 c=*locale++;
787             }
788             if(is_t(c)) {
789                 c=*locale;
790                 if(is_sep(c)) {
791                     return UCASE_LOC_LITHUANIAN;
792                 }
793             }
794         } else if(c=='n') {
795             /* nl or nld? */
796             c=*locale++;
797             if(is_l(c)) {
798                 c=*locale++;
799                 if(is_d(c)) {
800                     c=*locale;
801                 }
802                 if(is_sep(c)) {
803                     return UCASE_LOC_DUTCH;
804                 }
805             }
806         }
807     } else {
808         // uppercase c
809         // Same code as for lowercase c but also check for 'E'.
810         if(c=='T') {
811             /* tr or tur? */
812             c=*locale++;
813             if(is_u(c)) {
814                 c=*locale++;
815             }
816             if(is_r(c)) {
817                 c=*locale;
818                 if(is_sep(c)) {
819                     return UCASE_LOC_TURKISH;
820                 }
821             }
822         } else if(c=='A') {
823             /* az or aze? */
824             c=*locale++;
825             if(is_z(c)) {
826                 c=*locale++;
827                 if(is_e(c)) {
828                     c=*locale;
829                 }
830                 if(is_sep(c)) {
831                     return UCASE_LOC_TURKISH;
832                 }
833             }
834         } else if(c=='L') {
835             /* lt or lit? */
836             c=*locale++;
837             if(is_i(c)) {
838                 c=*locale++;
839             }
840             if(is_t(c)) {
841                 c=*locale;
842                 if(is_sep(c)) {
843                     return UCASE_LOC_LITHUANIAN;
844                 }
845             }
846         } else if(c=='E') {
847             /* el or ell? */
848             c=*locale++;
849             if(is_l(c)) {
850                 c=*locale++;
851                 if(is_l(c)) {
852                     c=*locale;
853                 }
854                 if(is_sep(c)) {
855                     return UCASE_LOC_GREEK;
856                 }
857             }
858         } else if(c=='N') {
859             /* nl or nld? */
860             c=*locale++;
861             if(is_l(c)) {
862                 c=*locale++;
863                 if(is_d(c)) {
864                     c=*locale;
865                 }
866                 if(is_sep(c)) {
867                     return UCASE_LOC_DUTCH;
868                 }
869             }
870         }
871     }
872     return UCASE_LOC_ROOT;
873 }
874 
875 /*
876  * Is followed by
877  *   {case-ignorable}* cased
878  * ?
879  * (dir determines looking forward/backward)
880  * If a character is case-ignorable, it is skipped regardless of whether
881  * it is also cased or not.
882  */
883 static UBool
isFollowedByCasedLetter(UCaseContextIterator * iter,void * context,int8_t dir)884 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
885     UChar32 c;
886 
887     if(iter==NULL) {
888         return FALSE;
889     }
890 
891     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
892         int32_t type=ucase_getTypeOrIgnorable(c);
893         if(type&4) {
894             /* case-ignorable, continue with the loop */
895         } else if(type!=UCASE_NONE) {
896             return TRUE; /* followed by cased letter */
897         } else {
898             return FALSE; /* uncased and not case-ignorable */
899         }
900     }
901 
902     return FALSE; /* not followed by cased letter */
903 }
904 
905 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
906 static UBool
isPrecededBySoftDotted(UCaseContextIterator * iter,void * context)907 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
908     UChar32 c;
909     int32_t dotType;
910     int8_t dir;
911 
912     if(iter==NULL) {
913         return FALSE;
914     }
915 
916     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
917         dotType=getDotType(c);
918         if(dotType==UCASE_SOFT_DOTTED) {
919             return TRUE; /* preceded by TYPE_i */
920         } else if(dotType!=UCASE_OTHER_ACCENT) {
921             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
922         }
923     }
924 
925     return FALSE; /* not preceded by TYPE_i */
926 }
927 
928 /*
929  * See Jitterbug 2344:
930  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
931  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
932  * we made those releases compatible with Unicode 3.2 which had not fixed
933  * a related bug in SpecialCasing.txt.
934  *
935  * From the Jitterbug 2344 text:
936  * ... this bug is listed as a Unicode erratum
937  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
938  * <quote>
939  * There are two errors in SpecialCasing.txt.
940  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
941  * 2. An incorrect context definition. Correct as follows:
942  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
943  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
944  * ---
945  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
946  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
947  * where the context After_I is defined as:
948  * The last preceding base character was an uppercase I, and there is no
949  * intervening combining character class 230 (ABOVE).
950  * </quote>
951  *
952  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
953  *
954  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
955  * # This matches the behavior of the canonically equivalent I-dot_above
956  *
957  * See also the description in this place in older versions of uchar.c (revision 1.100).
958  *
959  * Markus W. Scherer 2003-feb-15
960  */
961 
962 /* Is preceded by base character 'I' with no intervening cc=230 ? */
963 static UBool
isPrecededBy_I(UCaseContextIterator * iter,void * context)964 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
965     UChar32 c;
966     int32_t dotType;
967     int8_t dir;
968 
969     if(iter==NULL) {
970         return FALSE;
971     }
972 
973     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
974         if(c==0x49) {
975             return TRUE; /* preceded by I */
976         }
977         dotType=getDotType(c);
978         if(dotType!=UCASE_OTHER_ACCENT) {
979             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
980         }
981     }
982 
983     return FALSE; /* not preceded by I */
984 }
985 
986 /* Is followed by one or more cc==230 ? */
987 static UBool
isFollowedByMoreAbove(UCaseContextIterator * iter,void * context)988 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
989     UChar32 c;
990     int32_t dotType;
991     int8_t dir;
992 
993     if(iter==NULL) {
994         return FALSE;
995     }
996 
997     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
998         dotType=getDotType(c);
999         if(dotType==UCASE_ABOVE) {
1000             return TRUE; /* at least one cc==230 following */
1001         } else if(dotType!=UCASE_OTHER_ACCENT) {
1002             return FALSE; /* next base character, no more cc==230 following */
1003         }
1004     }
1005 
1006     return FALSE; /* no more cc==230 following */
1007 }
1008 
1009 /* Is followed by a dot above (without cc==230 in between) ? */
1010 static UBool
isFollowedByDotAbove(UCaseContextIterator * iter,void * context)1011 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1012     UChar32 c;
1013     int32_t dotType;
1014     int8_t dir;
1015 
1016     if(iter==NULL) {
1017         return FALSE;
1018     }
1019 
1020     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1021         if(c==0x307) {
1022             return TRUE;
1023         }
1024         dotType=getDotType(c);
1025         if(dotType!=UCASE_OTHER_ACCENT) {
1026             return FALSE; /* next base character or cc==230 in between */
1027         }
1028     }
1029 
1030     return FALSE; /* no dot above following */
1031 }
1032 
1033 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t loc)1034 ucase_toFullLower(UChar32 c,
1035                   UCaseContextIterator *iter, void *context,
1036                   const UChar **pString,
1037                   int32_t loc) {
1038     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1039     U_ASSERT(c >= 0);
1040     UChar32 result=c;
1041     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1042     if(!UCASE_HAS_EXCEPTION(props)) {
1043         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1044             result=c+UCASE_GET_DELTA(props);
1045         }
1046     } else {
1047         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1048         uint16_t excWord=*pe++;
1049         int32_t full;
1050 
1051         pe2=pe;
1052 
1053         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1054             /* use hardcoded conditions and mappings */
1055 
1056             /*
1057              * Test for conditional mappings first
1058              *   (otherwise the unconditional default mappings are always taken),
1059              * then test for characters that have unconditional mappings in SpecialCasing.txt,
1060              * then get the UnicodeData.txt mappings.
1061              */
1062             if( loc==UCASE_LOC_LITHUANIAN &&
1063                     /* base characters, find accents above */
1064                     (((c==0x49 || c==0x4a || c==0x12e) &&
1065                         isFollowedByMoreAbove(iter, context)) ||
1066                     /* precomposed with accent above, no need to find one */
1067                     (c==0xcc || c==0xcd || c==0x128))
1068             ) {
1069                 /*
1070                     # Lithuanian
1071 
1072                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1073 
1074                     # Introduce an explicit dot above when lowercasing capital I's and J's
1075                     # whenever there are more accents above.
1076                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1077 
1078                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1079                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1080                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1081                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1082                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1083                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1084                  */
1085                 switch(c) {
1086                 case 0x49:  /* LATIN CAPITAL LETTER I */
1087                     *pString=iDot;
1088                     return 2;
1089                 case 0x4a:  /* LATIN CAPITAL LETTER J */
1090                     *pString=jDot;
1091                     return 2;
1092                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1093                     *pString=iOgonekDot;
1094                     return 2;
1095                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
1096                     *pString=iDotGrave;
1097                     return 3;
1098                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
1099                     *pString=iDotAcute;
1100                     return 3;
1101                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1102                     *pString=iDotTilde;
1103                     return 3;
1104                 default:
1105                     return 0; /* will not occur */
1106                 }
1107             /* # Turkish and Azeri */
1108             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1109                 /*
1110                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1111                     # The following rules handle those cases.
1112 
1113                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1114                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1115                  */
1116                 return 0x69;
1117             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1118                 /*
1119                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1120                     # This matches the behavior of the canonically equivalent I-dot_above
1121 
1122                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1123                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1124                  */
1125                 *pString=nullptr;
1126                 return 0; /* remove the dot (continue without output) */
1127             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1128                 /*
1129                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1130 
1131                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1132                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1133                  */
1134                 return 0x131;
1135             } else if(c==0x130) {
1136                 /*
1137                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1138 
1139                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1140                  */
1141                 *pString=iDot;
1142                 return 2;
1143             } else if(  c==0x3a3 &&
1144                         !isFollowedByCasedLetter(iter, context, 1) &&
1145                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1146             ) {
1147                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1148                 /*
1149                     # Special case for final form of sigma
1150 
1151                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1152                  */
1153                 return 0x3c2; /* greek small final sigma */
1154             } else {
1155                 /* no known conditional special case mapping, use a normal mapping */
1156             }
1157         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1158             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1159             full&=UCASE_FULL_LOWER;
1160             if(full!=0) {
1161                 /* set the output pointer to the lowercase mapping */
1162                 *pString=reinterpret_cast<const UChar *>(pe+1);
1163 
1164                 /* return the string length */
1165                 return full;
1166             }
1167         }
1168 
1169         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1170             int32_t delta;
1171             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1172             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1173         }
1174         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1175             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1176         }
1177     }
1178 
1179     return (result==c) ? ~result : result;
1180 }
1181 
1182 /* internal */
1183 static int32_t
toUpperOrTitle(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t loc,UBool upperNotTitle)1184 toUpperOrTitle(UChar32 c,
1185                UCaseContextIterator *iter, void *context,
1186                const UChar **pString,
1187                int32_t loc,
1188                UBool upperNotTitle) {
1189     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1190     U_ASSERT(c >= 0);
1191     UChar32 result=c;
1192     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1193     if(!UCASE_HAS_EXCEPTION(props)) {
1194         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1195             result=c+UCASE_GET_DELTA(props);
1196         }
1197     } else {
1198         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1199         uint16_t excWord=*pe++;
1200         int32_t full, idx;
1201 
1202         pe2=pe;
1203 
1204         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1205             /* use hardcoded conditions and mappings */
1206             if(loc==UCASE_LOC_TURKISH && c==0x69) {
1207                 /*
1208                     # Turkish and Azeri
1209 
1210                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1211                     # The following rules handle those cases.
1212 
1213                     # When uppercasing, i turns into a dotted capital I
1214 
1215                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1216                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1217                 */
1218                 return 0x130;
1219             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1220                 /*
1221                     # Lithuanian
1222 
1223                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1224 
1225                     # Remove DOT ABOVE after "i" with upper or titlecase
1226 
1227                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1228                  */
1229                 *pString=nullptr;
1230                 return 0; /* remove the dot (continue without output) */
1231             } else {
1232                 /* no known conditional special case mapping, use a normal mapping */
1233             }
1234         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1235             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1236 
1237             /* start of full case mapping strings */
1238             ++pe;
1239 
1240             /* skip the lowercase and case-folding result strings */
1241             pe+=full&UCASE_FULL_LOWER;
1242             full>>=4;
1243             pe+=full&0xf;
1244             full>>=4;
1245 
1246             if(upperNotTitle) {
1247                 full&=0xf;
1248             } else {
1249                 /* skip the uppercase result string */
1250                 pe+=full&0xf;
1251                 full=(full>>4)&0xf;
1252             }
1253 
1254             if(full!=0) {
1255                 /* set the output pointer to the result string */
1256                 *pString=reinterpret_cast<const UChar *>(pe);
1257 
1258                 /* return the string length */
1259                 return full;
1260             }
1261         }
1262 
1263         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1264             int32_t delta;
1265             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1266             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1267         }
1268         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1269             idx=UCASE_EXC_TITLE;
1270         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1271             /* here, titlecase is same as uppercase */
1272             idx=UCASE_EXC_UPPER;
1273         } else {
1274             return ~c;
1275         }
1276         GET_SLOT_VALUE(excWord, idx, pe2, result);
1277     }
1278 
1279     return (result==c) ? ~result : result;
1280 }
1281 
1282 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t caseLocale)1283 ucase_toFullUpper(UChar32 c,
1284                   UCaseContextIterator *iter, void *context,
1285                   const UChar **pString,
1286                   int32_t caseLocale) {
1287     return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
1288 }
1289 
1290 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t caseLocale)1291 ucase_toFullTitle(UChar32 c,
1292                   UCaseContextIterator *iter, void *context,
1293                   const UChar **pString,
1294                   int32_t caseLocale) {
1295     return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
1296 }
1297 
1298 /* case folding ------------------------------------------------------------- */
1299 
1300 /*
1301  * Case folding is similar to lowercasing.
1302  * The result may be a simple mapping, i.e., a single code point, or
1303  * a full mapping, i.e., a string.
1304  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1305  * then only the lowercase mapping is stored.
1306  *
1307  * Some special cases are hardcoded because their conditions cannot be
1308  * parsed and processed from CaseFolding.txt.
1309  *
1310  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1311 
1312 # C: common case folding, common mappings shared by both simple and full mappings.
1313 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1314 # S: simple case folding, mappings to single characters where different from F.
1315 # T: special case for uppercase I and dotted uppercase I
1316 #    - For non-Turkic languages, this mapping is normally not used.
1317 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1318 #
1319 # Usage:
1320 #  A. To do a simple case folding, use the mappings with status C + S.
1321 #  B. To do a full case folding, use the mappings with status C + F.
1322 #
1323 #    The mappings with status T can be used or omitted depending on the desired case-folding
1324 #    behavior. (The default option is to exclude them.)
1325 
1326  * Unicode 3.2 has 'T' mappings as follows:
1327 
1328 0049; T; 0131; # LATIN CAPITAL LETTER I
1329 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1330 
1331  * while the default mappings for these code points are:
1332 
1333 0049; C; 0069; # LATIN CAPITAL LETTER I
1334 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1335 
1336  * U+0130 has no simple case folding (simple-case-folds to itself).
1337  */
1338 
1339 /* return the simple case folding mapping for c */
1340 U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c,uint32_t options)1341 ucase_fold(UChar32 c, uint32_t options) {
1342     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1343     if(!UCASE_HAS_EXCEPTION(props)) {
1344         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1345             c+=UCASE_GET_DELTA(props);
1346         }
1347     } else {
1348         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1349         uint16_t excWord=*pe++;
1350         int32_t idx;
1351         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1352             /* special case folding mappings, hardcoded */
1353             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1354                 /* default mappings */
1355                 if(c==0x49) {
1356                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1357                     return 0x69;
1358                 } else if(c==0x130) {
1359                     /* no simple case folding for U+0130 */
1360                     return c;
1361                 }
1362             } else {
1363                 /* Turkic mappings */
1364                 if(c==0x49) {
1365                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1366                     return 0x131;
1367                 } else if(c==0x130) {
1368                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1369                     return 0x69;
1370                 }
1371             }
1372         }
1373         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1374             return c;
1375         }
1376         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1377             int32_t delta;
1378             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1379             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1380         }
1381         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1382             idx=UCASE_EXC_FOLD;
1383         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1384             idx=UCASE_EXC_LOWER;
1385         } else {
1386             return c;
1387         }
1388         GET_SLOT_VALUE(excWord, idx, pe, c);
1389     }
1390     return c;
1391 }
1392 
1393 /*
1394  * Issue for canonical caseless match (UAX #21):
1395  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1396  * canonical equivalence, unlike default-option casefolding.
1397  * For example, I-grave and I + grave fold to strings that are not canonically
1398  * equivalent.
1399  * For more details, see the comment in unorm_compare() in unorm.cpp
1400  * and the intermediate prototype changes for Jitterbug 2021.
1401  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1402  *
1403  * This did not get fixed because it appears that it is not possible to fix
1404  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1405  * together in a way that they still fold to common result strings.
1406  */
1407 
1408 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,const UChar ** pString,uint32_t options)1409 ucase_toFullFolding(UChar32 c,
1410                     const UChar **pString,
1411                     uint32_t options) {
1412     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1413     U_ASSERT(c >= 0);
1414     UChar32 result=c;
1415     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1416     if(!UCASE_HAS_EXCEPTION(props)) {
1417         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1418             result=c+UCASE_GET_DELTA(props);
1419         }
1420     } else {
1421         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1422         uint16_t excWord=*pe++;
1423         int32_t full, idx;
1424 
1425         pe2=pe;
1426 
1427         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1428             /* use hardcoded conditions and mappings */
1429             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1430                 /* default mappings */
1431                 if(c==0x49) {
1432                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1433                     return 0x69;
1434                 } else if(c==0x130) {
1435                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1436                     *pString=iDot;
1437                     return 2;
1438                 }
1439             } else {
1440                 /* Turkic mappings */
1441                 if(c==0x49) {
1442                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1443                     return 0x131;
1444                 } else if(c==0x130) {
1445                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1446                     return 0x69;
1447                 }
1448             }
1449         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1450             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1451 
1452             /* start of full case mapping strings */
1453             ++pe;
1454 
1455             /* skip the lowercase result string */
1456             pe+=full&UCASE_FULL_LOWER;
1457             full=(full>>4)&0xf;
1458 
1459             if(full!=0) {
1460                 /* set the output pointer to the result string */
1461                 *pString=reinterpret_cast<const UChar *>(pe);
1462 
1463                 /* return the string length */
1464                 return full;
1465             }
1466         }
1467 
1468         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1469             return ~c;
1470         }
1471         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1472             int32_t delta;
1473             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1474             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1475         }
1476         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1477             idx=UCASE_EXC_FOLD;
1478         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1479             idx=UCASE_EXC_LOWER;
1480         } else {
1481             return ~c;
1482         }
1483         GET_SLOT_VALUE(excWord, idx, pe2, result);
1484     }
1485 
1486     return (result==c) ? ~result : result;
1487 }
1488 
1489 /* case mapping properties API ---------------------------------------------- */
1490 
1491 /* public API (see uchar.h) */
1492 
1493 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1494 u_isULowercase(UChar32 c) {
1495     return (UBool)(UCASE_LOWER==ucase_getType(c));
1496 }
1497 
1498 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1499 u_isUUppercase(UChar32 c) {
1500     return (UBool)(UCASE_UPPER==ucase_getType(c));
1501 }
1502 
1503 /* Transforms the Unicode character to its lower case equivalent.*/
1504 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1505 u_tolower(UChar32 c) {
1506     return ucase_tolower(c);
1507 }
1508 
1509 /* Transforms the Unicode character to its upper case equivalent.*/
1510 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1511 u_toupper(UChar32 c) {
1512     return ucase_toupper(c);
1513 }
1514 
1515 /* Transforms the Unicode character to its title case equivalent.*/
1516 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1517 u_totitle(UChar32 c) {
1518     return ucase_totitle(c);
1519 }
1520 
1521 /* return the simple case folding mapping for c */
1522 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1523 u_foldCase(UChar32 c, uint32_t options) {
1524     return ucase_fold(c, options);
1525 }
1526 
1527 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1528 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1529     /* case mapping properties */
1530     const UChar *resultString;
1531     switch(which) {
1532     case UCHAR_LOWERCASE:
1533         return (UBool)(UCASE_LOWER==ucase_getType(c));
1534     case UCHAR_UPPERCASE:
1535         return (UBool)(UCASE_UPPER==ucase_getType(c));
1536     case UCHAR_SOFT_DOTTED:
1537         return ucase_isSoftDotted(c);
1538     case UCHAR_CASE_SENSITIVE:
1539         return ucase_isCaseSensitive(c);
1540     case UCHAR_CASED:
1541         return (UBool)(UCASE_NONE!=ucase_getType(c));
1542     case UCHAR_CASE_IGNORABLE:
1543         return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1544     /*
1545      * Note: The following Changes_When_Xyz are defined as testing whether
1546      * the NFD form of the input changes when Xyz-case-mapped.
1547      * However, this simpler implementation of these properties,
1548      * ignoring NFD, passes the tests.
1549      * The implementation needs to be changed if the tests start failing.
1550      * When that happens, optimizations should be used to work with the
1551      * per-single-code point ucase_toFullXyz() functions unless
1552      * the NFD form has more than one code point,
1553      * and the property starts set needs to be the union of the
1554      * start sets for normalization and case mappings.
1555      */
1556     case UCHAR_CHANGES_WHEN_LOWERCASED:
1557         return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1558     case UCHAR_CHANGES_WHEN_UPPERCASED:
1559         return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1560     case UCHAR_CHANGES_WHEN_TITLECASED:
1561         return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1562     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1563     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1564         return (UBool)(
1565             ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1566             ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1567             ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1568     default:
1569         return FALSE;
1570     }
1571 }
1572