1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2004-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  ucase.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug30
16 *   created by: Markus W. Scherer
17 *
18 *   Low-level Unicode character/string case mapping code.
19 *   Much code moved here (and modified) from uchar.c.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "unicode/unistr.h"
24 #include "unicode/uset.h"
25 #include "unicode/udata.h" /* UDataInfo */
26 #include "unicode/utf16.h"
27 #include "ucmndata.h" /* DataHeader */
28 #include "udatamem.h"
29 #include "umutex.h"
30 #include "uassert.h"
31 #include "cmemory.h"
32 #include "utrie2.h"
33 #include "ucase.h"
34 
/*
 * In-memory view of the case-mapping property data used by the functions in this file.
 * All lookups below go through the single instance ucase_props_singleton,
 * which is expected to be defined by the machine-generated ucase_props_data.h.
 */
struct UCaseProps {
    UDataMemory *mem;             /* not referenced in the code shown here; presumably the owning data memory, if any */
    const int32_t *indexes;       /* not referenced in the code shown here; presumably the data file's index array */
    const uint16_t *exceptions;   /* exception entries (flags word + optional slots), addressed via GET_EXCEPTIONS() */
    const uint16_t *unfold;       /* reverse full-case-folding table; NULL-checked in ucase_addStringCaseClosure() */

    UTrie2 trie;                  /* maps a code point to its 16-bit properties word (read with UTRIE2_GET16) */
    uint8_t formatVersion[4];     /* presumably the version of the data format -- TODO confirm */
};
44 
45 /* ucase_props_data.h is machine-generated by gencase --csource */
46 #define INCLUDED_FROM_UCASE_CPP
47 #include "ucase_props_data.h"
48 
49 /* set of property starts for UnicodeSet ------------------------------------ */
50 
51 static UBool U_CALLCONV
_enumPropertyStartsRange(const void * context,UChar32 start,UChar32,uint32_t)52 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
53     /* add the start code point to the USet */
54     const USetAdder *sa=(const USetAdder *)context;
55     sa->add(sa->set, start);
56     return TRUE;
57 }
58 
59 U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const USetAdder * sa,UErrorCode * pErrorCode)60 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
61     if(U_FAILURE(*pErrorCode)) {
62         return;
63     }
64 
65     /* add the start code point of each same-value range of the trie */
66     utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
67 
68     /* add code points with hardcoded properties, plus the ones following them */
69 
70     /* (none right now, see comment below) */
71 
72     /*
73      * Omit code points with hardcoded specialcasing properties
74      * because we do not build property UnicodeSets for them right now.
75      */
76 }
77 
78 /* data access primitives --------------------------------------------------- */
79 
80 U_CFUNC const UTrie2 * U_EXPORT2
ucase_getTrie()81 ucase_getTrie() {
82     return &ucase_props_singleton.trie;
83 }
84 
/*
 * Pointer to the exceptions entry that belongs to a properties word:
 * the bits of props above UCASE_EXC_SHIFT are used as an index into csp->exceptions.
 * The callers in this file use it only when UCASE_HAS_EXCEPTION(props) is true
 * and read the entry's first uint16_t as the flags word ("excWord").
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
86 
/*
 * flagsOffset[b] is the number of 1-bits in the 8-bit value b.
 * SLOT_OFFSET() indexes it with the flag bits below a slot's own bit.
 *
 * The 256 entries are generated from the recurrence
 * bits(4*n+k)=bits(n)+bits(k) for k=0..3, two bits at a time.
 */
#define UCASE_BITS_2(n) (n), (n)+1, (n)+1, (n)+2
#define UCASE_BITS_4(n) UCASE_BITS_2(n), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+1), UCASE_BITS_2((n)+2)
#define UCASE_BITS_6(n) UCASE_BITS_4(n), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+1), UCASE_BITS_4((n)+2)
static const uint8_t flagsOffset[256]={
    UCASE_BITS_6(0), UCASE_BITS_6(1), UCASE_BITS_6(1), UCASE_BITS_6(2)
};
#undef UCASE_BITS_6
#undef UCASE_BITS_4
#undef UCASE_BITS_2
106 
/* Nonzero if the exceptions flags word has the bit for optional slot number idx set. */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/*
 * Number of slots stored before slot idx:
 * the count of flag bits that are set below bit idx, looked up in flagsOffset[].
 */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]

/*
 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
 *
 * If UCASE_EXC_DOUBLE_SLOTS is not set in excWord, each slot is one uint16_t.
 * Otherwise each slot is two uint16_t which are combined with the first one
 * as the upper 16 bits of the value.
 *
 * @param excWord (in) initial exceptions word
 * @param idx (in) desired slot index
 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
 *               moved to the last uint16_t of the value, use +1 for beginning of next slot
 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
 */
#define GET_SLOT_VALUE(excWord, idx, pExc16, value) UPRV_BLOCK_MACRO_BEGIN { \
    if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
        (pExc16)+=SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16; \
    } else { \
        (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
        (value)=*pExc16++; \
        (value)=((value)<<16)|*pExc16; \
    } \
} UPRV_BLOCK_MACRO_END
129 
130 /* simple case mappings ----------------------------------------------------- */
131 
132 U_CAPI UChar32 U_EXPORT2
ucase_tolower(UChar32 c)133 ucase_tolower(UChar32 c) {
134     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
135     if(!UCASE_HAS_EXCEPTION(props)) {
136         if(UCASE_IS_UPPER_OR_TITLE(props)) {
137             c+=UCASE_GET_DELTA(props);
138         }
139     } else {
140         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
141         uint16_t excWord=*pe++;
142         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
143             int32_t delta;
144             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
145             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
146         }
147         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
148             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
149         }
150     }
151     return c;
152 }
153 
154 U_CAPI UChar32 U_EXPORT2
ucase_toupper(UChar32 c)155 ucase_toupper(UChar32 c) {
156     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
157     if(!UCASE_HAS_EXCEPTION(props)) {
158         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
159             c+=UCASE_GET_DELTA(props);
160         }
161     } else {
162         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
163         uint16_t excWord=*pe++;
164         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
165             int32_t delta;
166             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
167             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
168         }
169         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
170             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
171         }
172     }
173     return c;
174 }
175 
176 U_CAPI UChar32 U_EXPORT2
ucase_totitle(UChar32 c)177 ucase_totitle(UChar32 c) {
178     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
179     if(!UCASE_HAS_EXCEPTION(props)) {
180         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
181             c+=UCASE_GET_DELTA(props);
182         }
183     } else {
184         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
185         uint16_t excWord=*pe++;
186         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
187             int32_t delta;
188             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
189             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
190         }
191         int32_t idx;
192         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
193             idx=UCASE_EXC_TITLE;
194         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
195             idx=UCASE_EXC_UPPER;
196         } else {
197             return c;
198         }
199         GET_SLOT_VALUE(excWord, idx, pe, c);
200     }
201     return c;
202 }
203 
204 static const UChar iDot[2] = { 0x69, 0x307 };
205 static const UChar jDot[2] = { 0x6a, 0x307 };
206 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
207 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
208 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
209 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
210 
211 
212 U_CFUNC void U_EXPORT2
ucase_addCaseClosure(UChar32 c,const USetAdder * sa)213 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
214     uint16_t props;
215 
216     /*
217      * Hardcode the case closure of i and its relatives and ignore the
218      * data file data for these characters.
219      * The Turkic dotless i and dotted I with their case mapping conditions
220      * and case folding option make the related characters behave specially.
221      * This code matches their closure behavior to their case folding behavior.
222      */
223 
224     switch(c) {
225     case 0x49:
226         /* regular i and I are in one equivalence class */
227         sa->add(sa->set, 0x69);
228         return;
229     case 0x69:
230         sa->add(sa->set, 0x49);
231         return;
232     case 0x130:
233         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
234         sa->addString(sa->set, iDot, 2);
235         return;
236     case 0x131:
237         /* dotless i is in a class by itself */
238         return;
239     default:
240         /* otherwise use the data file data */
241         break;
242     }
243 
244     props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
245     if(!UCASE_HAS_EXCEPTION(props)) {
246         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
247             /* add the one simple case mapping, no matter what type it is */
248             int32_t delta=UCASE_GET_DELTA(props);
249             if(delta!=0) {
250                 sa->add(sa->set, c+delta);
251             }
252         }
253     } else {
254         /*
255          * c has exceptions, so there may be multiple simple and/or
256          * full case mappings. Add them all.
257          */
258         const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
259         const UChar *closure;
260         uint16_t excWord=*pe++;
261         int32_t idx, closureLength, fullLength, length;
262 
263         pe0=pe;
264 
265         /* add all simple case mappings */
266         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
267             if(HAS_SLOT(excWord, idx)) {
268                 pe=pe0;
269                 GET_SLOT_VALUE(excWord, idx, pe, c);
270                 sa->add(sa->set, c);
271             }
272         }
273         if(HAS_SLOT(excWord, UCASE_EXC_DELTA)) {
274             pe=pe0;
275             int32_t delta;
276             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
277             sa->add(sa->set, (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta);
278         }
279 
280         /* get the closure string pointer & length */
281         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
282             pe=pe0;
283             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
284             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
285             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
286         } else {
287             closureLength=0;
288             closure=NULL;
289         }
290 
291         /* add the full case folding */
292         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
293             pe=pe0;
294             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
295 
296             /* start of full case mapping strings */
297             ++pe;
298 
299             fullLength&=0xffff; /* bits 16 and higher are reserved */
300 
301             /* skip the lowercase result string */
302             pe+=fullLength&UCASE_FULL_LOWER;
303             fullLength>>=4;
304 
305             /* add the full case folding string */
306             length=fullLength&0xf;
307             if(length!=0) {
308                 sa->addString(sa->set, (const UChar *)pe, length);
309                 pe+=length;
310             }
311 
312             /* skip the uppercase and titlecase strings */
313             fullLength>>=4;
314             pe+=fullLength&0xf;
315             fullLength>>=4;
316             pe+=fullLength;
317 
318             closure=(const UChar *)pe; /* behind full case mappings */
319         }
320 
321         /* add each code point in the closure string */
322         for(idx=0; idx<closureLength;) {
323             U16_NEXT_UNSAFE(closure, idx, c);
324             sa->add(sa->set, c);
325         }
326     }
327 }
328 
329 /*
330  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
331  * must be length>0 and max>0 and length<=max
332  */
333 static inline int32_t
strcmpMax(const UChar * s,int32_t length,const UChar * t,int32_t max)334 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
335     int32_t c1, c2;
336 
337     max-=length; /* we require length<=max, so no need to decrement max in the loop */
338     do {
339         c1=*s++;
340         c2=*t++;
341         if(c2==0) {
342             return 1; /* reached the end of t but not of s */
343         }
344         c1-=c2;
345         if(c1!=0) {
346             return c1; /* return difference result */
347         }
348     } while(--length>0);
349     /* ends with length==0 */
350 
351     if(max==0 || *t==0) {
352         return 0; /* equal to length of both strings */
353     } else {
354         return -max; /* return lengh difference */
355     }
356 }
357 
358 U_CFUNC UBool U_EXPORT2
ucase_addStringCaseClosure(const UChar * s,int32_t length,const USetAdder * sa)359 ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
360     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
361 
362     if(ucase_props_singleton.unfold==NULL || s==NULL) {
363         return FALSE; /* no reverse case folding data, or no string */
364     }
365     if(length<=1) {
366         /* the string is too short to find any match */
367         /*
368          * more precise would be:
369          * if(!u_strHasMoreChar32Than(s, length, 1))
370          * but this does not make much practical difference because
371          * a single supplementary code point would just not be found
372          */
373         return FALSE;
374     }
375 
376     const uint16_t *unfold=ucase_props_singleton.unfold;
377     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
378     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
379     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
380     unfold+=unfoldRowWidth;
381 
382     if(length>unfoldStringWidth) {
383         /* the string is too long to find any match */
384         return FALSE;
385     }
386 
387     /* do a binary search for the string */
388     start=0;
389     limit=unfoldRows;
390     while(start<limit) {
391         i=(start+limit)/2;
392         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
393         result=strcmpMax(s, length, p, unfoldStringWidth);
394 
395         if(result==0) {
396             /* found the string: add each code point, and its case closure */
397             UChar32 c;
398 
399             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
400                 U16_NEXT_UNSAFE(p, i, c);
401                 sa->add(sa->set, c);
402                 ucase_addCaseClosure(c, sa);
403             }
404             return TRUE;
405         } else if(result<0) {
406             limit=i;
407         } else /* result>0 */ {
408             start=i+1;
409         }
410     }
411 
412     return FALSE; /* string not found */
413 }
414 
415 U_NAMESPACE_BEGIN
416 
/*
 * Sets up iteration over the unfold table of ucase_props_singleton,
 * positioned at the first code point column of row 0.
 *
 * The row count, row width and string width are read through the member
 * `unfold` while it still points at the start of the table.
 * NOTE(review): this relies on `unfold` being initialized before the other
 * members, which depends on the member declaration order in the class
 * definition (not visible here) -- confirm.
 * NOTE(review): unlike ucase_addStringCaseClosure(), there is no check for
 * ucase_props_singleton.unfold being NULL.
 */
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    // Advance by one row width, past the header values read above,
    // so that row indexes are relative to the first data row.
    unfold+=unfoldRowWidth;
}
426 
427 UChar32
next(UnicodeString & full)428 FullCaseFoldingIterator::next(UnicodeString &full) {
429     // Advance past the last-delivered code point.
430     const UChar *p=unfold+(currentRow*unfoldRowWidth);
431     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
432         ++currentRow;
433         p+=unfoldRowWidth;
434         rowCpIndex=unfoldStringWidth;
435     }
436     if(currentRow>=unfoldRows) { return U_SENTINEL; }
437     // Set "full" to the NUL-terminated string in the first unfold column.
438     int32_t length=unfoldStringWidth;
439     while(length>0 && p[length-1]==0) { --length; }
440     full.setTo(FALSE, p, length);
441     // Return the code point.
442     UChar32 c;
443     U16_NEXT_UNSAFE(p, rowCpIndex, c);
444     return c;
445 }
446 
/*
 * Per-code-point case mapping tables for the low Latin range.
 *
 * Each table lists 384 (0x180) entries in rows of 16, so the entry for a
 * code point cp in U+0000..U+017F is at index cp.
 * LIMIT and EXC are declared outside of this file section.
 * Judging by the values (for example 32 for 'A'..'Z' in the lowercasing
 * tables and -32 for 'a'..'z' in the uppercasing tables), an entry is the
 * delta to add to the code point to get its mapping, and 0 means "unchanged".
 * EXC presumably marks code points that need the general, slower mapping
 * code -- TODO confirm against the declarations of LIMIT and EXC.
 */
namespace LatinCase {

/* Lowercasing deltas; presumably for locales without special lowercasing behavior. */
const int8_t TO_LOWER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

/*
 * Lowercasing deltas; judging by the name, for Turkic and Lithuanian locales.
 * Compared with TO_LOWER_NORMAL, the entries for U+0049, U+004A, U+00CC,
 * U+00CD, U+0128 and U+012E are EXC.
 */
const int8_t TO_LOWER_TR_LT[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32, 32, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, EXC, EXC, 32, 32,
    32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, EXC,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, EXC, 0, 1, 0, 1, 0, EXC, 0,
    EXC, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,

    0, 1, 0, 1, 0, 1, 0, 1, 0, EXC, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
    1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, EXC
};

/* Uppercasing deltas; presumably for locales without special uppercasing behavior. */
const int8_t TO_UPPER_NORMAL[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

/*
 * Uppercasing deltas; judging by the name, for Turkic locales.
 * Compared with TO_UPPER_NORMAL, the entry for U+0069 ('i') is EXC.
 */
const int8_t TO_UPPER_TR[LIMIT] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, -32, -32, -32, -32, -32, -32, -32, -32, EXC, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, EXC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, EXC,
    -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
    -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121,

    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, EXC, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0,

    -1, 0, -1, 0, -1, 0, -1, 0, -1, EXC, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,
    0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, EXC
};

}  // namespace LatinCase
578 
579 U_NAMESPACE_END
580 
581 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
582 U_CAPI int32_t U_EXPORT2
ucase_getType(UChar32 c)583 ucase_getType(UChar32 c) {
584     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
585     return UCASE_GET_TYPE(props);
586 }
587 
588 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
589 U_CAPI int32_t U_EXPORT2
ucase_getTypeOrIgnorable(UChar32 c)590 ucase_getTypeOrIgnorable(UChar32 c) {
591     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
592     return UCASE_GET_TYPE_AND_IGNORABLE(props);
593 }
594 
595 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
596 static inline int32_t
getDotType(UChar32 c)597 getDotType(UChar32 c) {
598     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
599     if(!UCASE_HAS_EXCEPTION(props)) {
600         return props&UCASE_DOT_MASK;
601     } else {
602         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
603         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
604     }
605 }
606 
607 U_CAPI UBool U_EXPORT2
ucase_isSoftDotted(UChar32 c)608 ucase_isSoftDotted(UChar32 c) {
609     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
610 }
611 
612 U_CAPI UBool U_EXPORT2
ucase_isCaseSensitive(UChar32 c)613 ucase_isCaseSensitive(UChar32 c) {
614     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
615     if(!UCASE_HAS_EXCEPTION(props)) {
616         return (UBool)((props&UCASE_SENSITIVE)!=0);
617     } else {
618         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
619         return (UBool)((*pe&UCASE_EXC_SENSITIVE)!=0);
620     }
621 }
622 
623 /* string casing ------------------------------------------------------------ */
624 
625 /*
626  * These internal functions form the core of string case mappings.
627  * They map single code points to result code points or strings and take
628  * all necessary conditions (context, locale ID, options) into account.
629  *
630  * They do not iterate over the source or write to the destination
631  * so that the same functions are useful for non-standard string storage,
632  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
633  * For the same reason, the "surrounding text" context is passed in as a
634  * UCaseContextIterator which does not make any assumptions about
635  * the underlying storage.
636  *
637  * This section contains helper functions that check for conditions
638  * in the input text surrounding the current code point
639  * according to SpecialCasing.txt.
640  *
641  * Each helper function gets the index
642  * - after the current code point if it looks at following text
643  * - before the current code point if it looks at preceding text
644  *
645  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
646  *
647  * Final_Sigma
648  *   C is preceded by a sequence consisting of
649  *     a cased letter and a case-ignorable sequence,
650  *   and C is not followed by a sequence consisting of
651  *     an ignorable sequence and then a cased letter.
652  *
653  * More_Above
654  *   C is followed by one or more characters of combining class 230 (ABOVE)
655  *   in the combining character sequence.
656  *
657  * After_Soft_Dotted
658  *   The last preceding character with combining class of zero before C
659  *   was Soft_Dotted,
660  *   and there is no intervening combining character class 230 (ABOVE).
661  *
662  * Before_Dot
663  *   C is followed by combining dot above (U+0307).
664  *   Any sequence of characters with a combining class that is neither 0 nor 230
665  *   may intervene between the current character and the combining dot above.
666  *
667  * The erratum from 2002-10-31 adds the condition
668  *
669  * After_I
670  *   The last preceding base character was an uppercase I, and there is no
671  *   intervening combining character class 230 (ABOVE).
672  *
673  *   (See Jitterbug 2344 and the comments on After_I below.)
674  *
675  * Helper definitions in Unicode 3.2 UAX 21:
676  *
677  * D1. A character C is defined to be cased
678  *     if it meets any of the following criteria:
679  *
680  *   - The general category of C is Titlecase Letter (Lt)
681  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
682  *   - Given D = NFD(C), then it is not the case that:
683  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
685  *      for Unicode 3.2. Ignored.)
686  *
687  * D2. A character C is defined to be case-ignorable
688  *     if it meets either of the following criteria:
689  *
690  *   - The general category of C is
691  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
692  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
693  *   - C is one of the following characters
694  *     U+0027 APOSTROPHE
695  *     U+00AD SOFT HYPHEN (SHY)
696  *     U+2019 RIGHT SINGLE QUOTATION MARK
697  *            (the preferred character for apostrophe)
698  *
699  * D3. A case-ignorable sequence is a sequence of
700  *     zero or more case-ignorable characters.
701  */
702 
/*
 * Case-insensitive tests of one char against one specific letter.
 * ucase_getCaseLocale() uses them to recognize language codes
 * regardless of their case.
 * Each macro evaluates its argument twice.
 */
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_y(c) ((c)=='y' || (c)=='Y')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* Is c a separator ('_' or '-') or the terminating NUL, that is, does the language code end here? */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
715 
716 /**
717  * Requires non-NULL locale ID but otherwise does the equivalent of
718  * checking for language codes as if uloc_getLanguage() were called:
719  * Accepts both 2- and 3-letter codes and accepts case variants.
720  */
721 U_CFUNC int32_t
ucase_getCaseLocale(const char * locale)722 ucase_getCaseLocale(const char *locale) {
723     /*
724      * This function used to use uloc_getLanguage(), but the current code
725      * removes the dependency of this low-level code on uloc implementation code
726      * and is faster because not the whole locale ID has to be
727      * examined and copied/transformed.
728      *
729      * Because this code does not want to depend on uloc, the caller must
730      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
731      */
732     char c=*locale++;
733     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
734     // and for Chinese "zh": Very common but no special case mapping behavior.
735     // Then check lowercase vs. uppercase to reduce the number of comparisons
736     // for other locales without special behavior.
737     if(c=='e') {
738         /* el or ell? */
739         c=*locale++;
740         if(is_l(c)) {
741             c=*locale++;
742             if(is_l(c)) {
743                 c=*locale;
744             }
745             if(is_sep(c)) {
746                 return UCASE_LOC_GREEK;
747             }
748         }
749         // en, es, ... -> root
750     } else if(c=='z') {
751         return UCASE_LOC_ROOT;
752 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
753     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
754 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
755     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
756 #else
757 #   error Unknown charset family!
758 #endif
759         // lowercase c
760         if(c=='t') {
761             /* tr or tur? */
762             c=*locale++;
763             if(is_u(c)) {
764                 c=*locale++;
765             }
766             if(is_r(c)) {
767                 c=*locale;
768                 if(is_sep(c)) {
769                     return UCASE_LOC_TURKISH;
770                 }
771             }
772         } else if(c=='a') {
773             /* az or aze? */
774             c=*locale++;
775             if(is_z(c)) {
776                 c=*locale++;
777                 if(is_e(c)) {
778                     c=*locale;
779                 }
780                 if(is_sep(c)) {
781                     return UCASE_LOC_TURKISH;
782                 }
783             }
784         } else if(c=='l') {
785             /* lt or lit? */
786             c=*locale++;
787             if(is_i(c)) {
788                 c=*locale++;
789             }
790             if(is_t(c)) {
791                 c=*locale;
792                 if(is_sep(c)) {
793                     return UCASE_LOC_LITHUANIAN;
794                 }
795             }
796         } else if(c=='n') {
797             /* nl or nld? */
798             c=*locale++;
799             if(is_l(c)) {
800                 c=*locale++;
801                 if(is_d(c)) {
802                     c=*locale;
803                 }
804                 if(is_sep(c)) {
805                     return UCASE_LOC_DUTCH;
806                 }
807             }
808         } else if(c=='h') {
809             /* hy or hye? *not* hyw */
810             c=*locale++;
811             if(is_y(c)) {
812                 c=*locale++;
813                 if(is_e(c)) {
814                     c=*locale;
815                 }
816                 if(is_sep(c)) {
817                     return UCASE_LOC_ARMENIAN;
818                 }
819             }
820         }
821     } else {
822         // uppercase c
823         // Same code as for lowercase c but also check for 'E'.
824         if(c=='T') {
825             /* tr or tur? */
826             c=*locale++;
827             if(is_u(c)) {
828                 c=*locale++;
829             }
830             if(is_r(c)) {
831                 c=*locale;
832                 if(is_sep(c)) {
833                     return UCASE_LOC_TURKISH;
834                 }
835             }
836         } else if(c=='A') {
837             /* az or aze? */
838             c=*locale++;
839             if(is_z(c)) {
840                 c=*locale++;
841                 if(is_e(c)) {
842                     c=*locale;
843                 }
844                 if(is_sep(c)) {
845                     return UCASE_LOC_TURKISH;
846                 }
847             }
848         } else if(c=='L') {
849             /* lt or lit? */
850             c=*locale++;
851             if(is_i(c)) {
852                 c=*locale++;
853             }
854             if(is_t(c)) {
855                 c=*locale;
856                 if(is_sep(c)) {
857                     return UCASE_LOC_LITHUANIAN;
858                 }
859             }
860         } else if(c=='E') {
861             /* el or ell? */
862             c=*locale++;
863             if(is_l(c)) {
864                 c=*locale++;
865                 if(is_l(c)) {
866                     c=*locale;
867                 }
868                 if(is_sep(c)) {
869                     return UCASE_LOC_GREEK;
870                 }
871             }
872         } else if(c=='N') {
873             /* nl or nld? */
874             c=*locale++;
875             if(is_l(c)) {
876                 c=*locale++;
877                 if(is_d(c)) {
878                     c=*locale;
879                 }
880                 if(is_sep(c)) {
881                     return UCASE_LOC_DUTCH;
882                 }
883             }
884         } else if(c=='H') {
885             /* hy or hye? *not* hyw */
886             c=*locale++;
887             if(is_y(c)) {
888                 c=*locale++;
889                 if(is_e(c)) {
890                     c=*locale;
891                 }
892                 if(is_sep(c)) {
893                     return UCASE_LOC_ARMENIAN;
894                 }
895             }
896         }
897     }
898     return UCASE_LOC_ROOT;
899 }
900 
901 /*
902  * Is followed by
903  *   {case-ignorable}* cased
904  * ?
905  * (dir determines looking forward/backward)
906  * If a character is case-ignorable, it is skipped regardless of whether
907  * it is also cased or not.
908  */
909 static UBool
isFollowedByCasedLetter(UCaseContextIterator * iter,void * context,int8_t dir)910 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
911     UChar32 c;
912 
913     if(iter==NULL) {
914         return FALSE;
915     }
916 
917     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
918         int32_t type=ucase_getTypeOrIgnorable(c);
919         if(type&4) {
920             /* case-ignorable, continue with the loop */
921         } else if(type!=UCASE_NONE) {
922             return TRUE; /* followed by cased letter */
923         } else {
924             return FALSE; /* uncased and not case-ignorable */
925         }
926     }
927 
928     return FALSE; /* not followed by cased letter */
929 }
930 
931 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
932 static UBool
isPrecededBySoftDotted(UCaseContextIterator * iter,void * context)933 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
934     UChar32 c;
935     int32_t dotType;
936     int8_t dir;
937 
938     if(iter==NULL) {
939         return FALSE;
940     }
941 
942     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
943         dotType=getDotType(c);
944         if(dotType==UCASE_SOFT_DOTTED) {
945             return TRUE; /* preceded by TYPE_i */
946         } else if(dotType!=UCASE_OTHER_ACCENT) {
947             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
948         }
949     }
950 
951     return FALSE; /* not preceded by TYPE_i */
952 }
953 
954 /*
955  * See Jitterbug 2344:
956  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
957  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
958  * we made those releases compatible with Unicode 3.2 which had not fixed
959  * a related bug in SpecialCasing.txt.
960  *
961  * From the Jitterbug 2344 text:
962  * ... this bug is listed as a Unicode erratum
963  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
964  * <quote>
965  * There are two errors in SpecialCasing.txt.
966  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
967  * 2. An incorrect context definition. Correct as follows:
968  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
969  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
970  * ---
971  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
972  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
973  * where the context After_I is defined as:
974  * The last preceding base character was an uppercase I, and there is no
975  * intervening combining character class 230 (ABOVE).
976  * </quote>
977  *
978  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
979  *
980  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
981  * # This matches the behavior of the canonically equivalent I-dot_above
982  *
983  * See also the description in this place in older versions of uchar.c (revision 1.100).
984  *
985  * Markus W. Scherer 2003-feb-15
986  */
987 
988 /* Is preceded by base character 'I' with no intervening cc=230 ? */
989 static UBool
isPrecededBy_I(UCaseContextIterator * iter,void * context)990 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
991     UChar32 c;
992     int32_t dotType;
993     int8_t dir;
994 
995     if(iter==NULL) {
996         return FALSE;
997     }
998 
999     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
1000         if(c==0x49) {
1001             return TRUE; /* preceded by I */
1002         }
1003         dotType=getDotType(c);
1004         if(dotType!=UCASE_OTHER_ACCENT) {
1005             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
1006         }
1007     }
1008 
1009     return FALSE; /* not preceded by I */
1010 }
1011 
1012 /* Is followed by one or more cc==230 ? */
1013 static UBool
isFollowedByMoreAbove(UCaseContextIterator * iter,void * context)1014 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
1015     UChar32 c;
1016     int32_t dotType;
1017     int8_t dir;
1018 
1019     if(iter==NULL) {
1020         return FALSE;
1021     }
1022 
1023     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1024         dotType=getDotType(c);
1025         if(dotType==UCASE_ABOVE) {
1026             return TRUE; /* at least one cc==230 following */
1027         } else if(dotType!=UCASE_OTHER_ACCENT) {
1028             return FALSE; /* next base character, no more cc==230 following */
1029         }
1030     }
1031 
1032     return FALSE; /* no more cc==230 following */
1033 }
1034 
1035 /* Is followed by a dot above (without cc==230 in between) ? */
1036 static UBool
isFollowedByDotAbove(UCaseContextIterator * iter,void * context)1037 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
1038     UChar32 c;
1039     int32_t dotType;
1040     int8_t dir;
1041 
1042     if(iter==NULL) {
1043         return FALSE;
1044     }
1045 
1046     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
1047         if(c==0x307) {
1048             return TRUE;
1049         }
1050         dotType=getDotType(c);
1051         if(dotType!=UCASE_OTHER_ACCENT) {
1052             return FALSE; /* next base character or cc==230 in between */
1053         }
1054     }
1055 
1056     return FALSE; /* no dot above following */
1057 }
1058 
1059 U_CAPI int32_t U_EXPORT2
ucase_toFullLower(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t loc)1060 ucase_toFullLower(UChar32 c,
1061                   UCaseContextIterator *iter, void *context,
1062                   const UChar **pString,
1063                   int32_t loc) {
1064     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1065     U_ASSERT(c >= 0);
1066     UChar32 result=c;
1067     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1068     if(!UCASE_HAS_EXCEPTION(props)) {
1069         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1070             result=c+UCASE_GET_DELTA(props);
1071         }
1072     } else {
1073         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1074         uint16_t excWord=*pe++;
1075         int32_t full;
1076 
1077         pe2=pe;
1078 
1079         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1080             /* use hardcoded conditions and mappings */
1081 
1082             /*
1083              * Test for conditional mappings first
1084              *   (otherwise the unconditional default mappings are always taken),
1085              * then test for characters that have unconditional mappings in SpecialCasing.txt,
1086              * then get the UnicodeData.txt mappings.
1087              */
1088             if( loc==UCASE_LOC_LITHUANIAN &&
1089                     /* base characters, find accents above */
1090                     (((c==0x49 || c==0x4a || c==0x12e) &&
1091                         isFollowedByMoreAbove(iter, context)) ||
1092                     /* precomposed with accent above, no need to find one */
1093                     (c==0xcc || c==0xcd || c==0x128))
1094             ) {
1095                 /*
1096                     # Lithuanian
1097 
1098                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1099 
1100                     # Introduce an explicit dot above when lowercasing capital I's and J's
1101                     # whenever there are more accents above.
1102                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
1103 
1104                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
1105                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
1106                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
1107                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
1108                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
1109                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
1110                  */
1111                 switch(c) {
1112                 case 0x49:  /* LATIN CAPITAL LETTER I */
1113                     *pString=iDot;
1114                     return 2;
1115                 case 0x4a:  /* LATIN CAPITAL LETTER J */
1116                     *pString=jDot;
1117                     return 2;
1118                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
1119                     *pString=iOgonekDot;
1120                     return 2;
1121                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
1122                     *pString=iDotGrave;
1123                     return 3;
1124                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
1125                     *pString=iDotAcute;
1126                     return 3;
1127                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
1128                     *pString=iDotTilde;
1129                     return 3;
1130                 default:
1131                     return 0; /* will not occur */
1132                 }
1133             /* # Turkish and Azeri */
1134             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
1135                 /*
1136                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1137                     # The following rules handle those cases.
1138 
1139                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
1140                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
1141                  */
1142                 return 0x69;
1143             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
1144                 /*
1145                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
1146                     # This matches the behavior of the canonically equivalent I-dot_above
1147 
1148                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
1149                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
1150                  */
1151                 *pString=nullptr;
1152                 return 0; /* remove the dot (continue without output) */
1153             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
1154                 /*
1155                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
1156 
1157                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
1158                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
1159                  */
1160                 return 0x131;
1161             } else if(c==0x130) {
1162                 /*
1163                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
1164 
1165                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1166                  */
1167                 *pString=iDot;
1168                 return 2;
1169             } else if(  c==0x3a3 &&
1170                         !isFollowedByCasedLetter(iter, context, 1) &&
1171                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
1172             ) {
1173                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
1174                 /*
1175                     # Special case for final form of sigma
1176 
1177                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
1178                  */
1179                 return 0x3c2; /* greek small final sigma */
1180             } else {
1181                 /* no known conditional special case mapping, use a normal mapping */
1182             }
1183         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1184             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1185             full&=UCASE_FULL_LOWER;
1186             if(full!=0) {
1187                 /* set the output pointer to the lowercase mapping */
1188                 *pString=reinterpret_cast<const UChar *>(pe+1);
1189 
1190                 /* return the string length */
1191                 return full;
1192             }
1193         }
1194 
1195         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1196             int32_t delta;
1197             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1198             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1199         }
1200         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1201             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
1202         }
1203     }
1204 
1205     return (result==c) ? ~result : result;
1206 }
1207 
1208 /* internal */
1209 static int32_t
toUpperOrTitle(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t loc,UBool upperNotTitle)1210 toUpperOrTitle(UChar32 c,
1211                UCaseContextIterator *iter, void *context,
1212                const UChar **pString,
1213                int32_t loc,
1214                UBool upperNotTitle) {
1215     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1216     U_ASSERT(c >= 0);
1217     UChar32 result=c;
1218     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1219     if(!UCASE_HAS_EXCEPTION(props)) {
1220         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
1221             result=c+UCASE_GET_DELTA(props);
1222         }
1223     } else {
1224         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1225         uint16_t excWord=*pe++;
1226         int32_t full, idx;
1227 
1228         pe2=pe;
1229 
1230         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
1231             /* use hardcoded conditions and mappings */
1232             if(loc==UCASE_LOC_TURKISH && c==0x69) {
1233                 /*
1234                     # Turkish and Azeri
1235 
1236                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
1237                     # The following rules handle those cases.
1238 
1239                     # When uppercasing, i turns into a dotted capital I
1240 
1241                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
1242                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
1243                 */
1244                 return 0x130;
1245             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
1246                 /*
1247                     # Lithuanian
1248 
1249                     # Lithuanian retains the dot in a lowercase i when followed by accents.
1250 
1251                     # Remove DOT ABOVE after "i" with upper or titlecase
1252 
1253                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
1254                  */
1255                 *pString=nullptr;
1256                 return 0; /* remove the dot (continue without output) */
1257             } else if(c==0x0587) {
1258                 // See ICU-13416:
1259                 // և ligature ech-yiwn
1260                 // uppercases to ԵՒ=ech+yiwn by default and in Western Armenian,
1261                 // but to ԵՎ=ech+vew in Eastern Armenian.
1262                 if(loc==UCASE_LOC_ARMENIAN) {
1263                     *pString=upperNotTitle ? u"ԵՎ" : u"Եվ";
1264                 } else {
1265                     *pString=upperNotTitle ? u"ԵՒ" : u"Եւ";
1266                 }
1267                 return 2;
1268             } else {
1269                 /* no known conditional special case mapping, use a normal mapping */
1270             }
1271         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1272             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1273 
1274             /* start of full case mapping strings */
1275             ++pe;
1276 
1277             /* skip the lowercase and case-folding result strings */
1278             pe+=full&UCASE_FULL_LOWER;
1279             full>>=4;
1280             pe+=full&0xf;
1281             full>>=4;
1282 
1283             if(upperNotTitle) {
1284                 full&=0xf;
1285             } else {
1286                 /* skip the uppercase result string */
1287                 pe+=full&0xf;
1288                 full=(full>>4)&0xf;
1289             }
1290 
1291             if(full!=0) {
1292                 /* set the output pointer to the result string */
1293                 *pString=reinterpret_cast<const UChar *>(pe);
1294 
1295                 /* return the string length */
1296                 return full;
1297             }
1298         }
1299 
1300         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_GET_TYPE(props)==UCASE_LOWER) {
1301             int32_t delta;
1302             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1303             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1304         }
1305         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
1306             idx=UCASE_EXC_TITLE;
1307         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
1308             /* here, titlecase is same as uppercase */
1309             idx=UCASE_EXC_UPPER;
1310         } else {
1311             return ~c;
1312         }
1313         GET_SLOT_VALUE(excWord, idx, pe2, result);
1314     }
1315 
1316     return (result==c) ? ~result : result;
1317 }
1318 
1319 U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t caseLocale)1320 ucase_toFullUpper(UChar32 c,
1321                   UCaseContextIterator *iter, void *context,
1322                   const UChar **pString,
1323                   int32_t caseLocale) {
1324     return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
1325 }
1326 
1327 U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,UCaseContextIterator * iter,void * context,const UChar ** pString,int32_t caseLocale)1328 ucase_toFullTitle(UChar32 c,
1329                   UCaseContextIterator *iter, void *context,
1330                   const UChar **pString,
1331                   int32_t caseLocale) {
1332     return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
1333 }
1334 
1335 /* case folding ------------------------------------------------------------- */
1336 
1337 /*
1338  * Case folding is similar to lowercasing.
1339  * The result may be a simple mapping, i.e., a single code point, or
1340  * a full mapping, i.e., a string.
1341  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
1342  * then only the lowercase mapping is stored.
1343  *
1344  * Some special cases are hardcoded because their conditions cannot be
1345  * parsed and processed from CaseFolding.txt.
1346  *
1347  * Unicode 3.2 CaseFolding.txt specifies for its status field:
1348 
1349 # C: common case folding, common mappings shared by both simple and full mappings.
1350 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
1351 # S: simple case folding, mappings to single characters where different from F.
1352 # T: special case for uppercase I and dotted uppercase I
1353 #    - For non-Turkic languages, this mapping is normally not used.
1354 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
1355 #
1356 # Usage:
1357 #  A. To do a simple case folding, use the mappings with status C + S.
1358 #  B. To do a full case folding, use the mappings with status C + F.
1359 #
1360 #    The mappings with status T can be used or omitted depending on the desired case-folding
1361 #    behavior. (The default option is to exclude them.)
1362 
1363  * Unicode 3.2 has 'T' mappings as follows:
1364 
1365 0049; T; 0131; # LATIN CAPITAL LETTER I
1366 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1367 
1368  * while the default mappings for these code points are:
1369 
1370 0049; C; 0069; # LATIN CAPITAL LETTER I
1371 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
1372 
1373  * U+0130 has no simple case folding (simple-case-folds to itself).
1374  */
1375 
1376 /* return the simple case folding mapping for c */
1377 U_CAPI UChar32 U_EXPORT2
ucase_fold(UChar32 c,uint32_t options)1378 ucase_fold(UChar32 c, uint32_t options) {
1379     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1380     if(!UCASE_HAS_EXCEPTION(props)) {
1381         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1382             c+=UCASE_GET_DELTA(props);
1383         }
1384     } else {
1385         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
1386         uint16_t excWord=*pe++;
1387         int32_t idx;
1388         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1389             /* special case folding mappings, hardcoded */
1390             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1391                 /* default mappings */
1392                 if(c==0x49) {
1393                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1394                     return 0x69;
1395                 } else if(c==0x130) {
1396                     /* no simple case folding for U+0130 */
1397                     return c;
1398                 }
1399             } else {
1400                 /* Turkic mappings */
1401                 if(c==0x49) {
1402                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1403                     return 0x131;
1404                 } else if(c==0x130) {
1405                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1406                     return 0x69;
1407                 }
1408             }
1409         }
1410         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1411             return c;
1412         }
1413         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1414             int32_t delta;
1415             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe, delta);
1416             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1417         }
1418         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1419             idx=UCASE_EXC_FOLD;
1420         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1421             idx=UCASE_EXC_LOWER;
1422         } else {
1423             return c;
1424         }
1425         GET_SLOT_VALUE(excWord, idx, pe, c);
1426     }
1427     return c;
1428 }
1429 
1430 /*
1431  * Issue for canonical caseless match (UAX #21):
1432  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
1433  * canonical equivalence, unlike default-option casefolding.
1434  * For example, I-grave and I + grave fold to strings that are not canonically
1435  * equivalent.
1436  * For more details, see the comment in unorm_compare() in unorm.cpp
1437  * and the intermediate prototype changes for Jitterbug 2021.
1438  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
1439  *
1440  * This did not get fixed because it appears that it is not possible to fix
1441  * it for uppercase and lowercase characters (I-grave vs. i-grave)
1442  * together in a way that they still fold to common result strings.
1443  */
1444 
1445 U_CAPI int32_t U_EXPORT2
ucase_toFullFolding(UChar32 c,const UChar ** pString,uint32_t options)1446 ucase_toFullFolding(UChar32 c,
1447                     const UChar **pString,
1448                     uint32_t options) {
1449     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
1450     U_ASSERT(c >= 0);
1451     UChar32 result=c;
1452     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
1453     if(!UCASE_HAS_EXCEPTION(props)) {
1454         if(UCASE_IS_UPPER_OR_TITLE(props)) {
1455             result=c+UCASE_GET_DELTA(props);
1456         }
1457     } else {
1458         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
1459         uint16_t excWord=*pe++;
1460         int32_t full, idx;
1461 
1462         pe2=pe;
1463 
1464         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
1465             /* use hardcoded conditions and mappings */
1466             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
1467                 /* default mappings */
1468                 if(c==0x49) {
1469                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
1470                     return 0x69;
1471                 } else if(c==0x130) {
1472                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1473                     *pString=iDot;
1474                     return 2;
1475                 }
1476             } else {
1477                 /* Turkic mappings */
1478                 if(c==0x49) {
1479                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
1480                     return 0x131;
1481                 } else if(c==0x130) {
1482                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
1483                     return 0x69;
1484                 }
1485             }
1486         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
1487             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
1488 
1489             /* start of full case mapping strings */
1490             ++pe;
1491 
1492             /* skip the lowercase result string */
1493             pe+=full&UCASE_FULL_LOWER;
1494             full=(full>>4)&0xf;
1495 
1496             if(full!=0) {
1497                 /* set the output pointer to the result string */
1498                 *pString=reinterpret_cast<const UChar *>(pe);
1499 
1500                 /* return the string length */
1501                 return full;
1502             }
1503         }
1504 
1505         if((excWord&UCASE_EXC_NO_SIMPLE_CASE_FOLDING)!=0) {
1506             return ~c;
1507         }
1508         if(HAS_SLOT(excWord, UCASE_EXC_DELTA) && UCASE_IS_UPPER_OR_TITLE(props)) {
1509             int32_t delta;
1510             GET_SLOT_VALUE(excWord, UCASE_EXC_DELTA, pe2, delta);
1511             return (excWord&UCASE_EXC_DELTA_IS_NEGATIVE)==0 ? c+delta : c-delta;
1512         }
1513         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
1514             idx=UCASE_EXC_FOLD;
1515         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
1516             idx=UCASE_EXC_LOWER;
1517         } else {
1518             return ~c;
1519         }
1520         GET_SLOT_VALUE(excWord, idx, pe2, result);
1521     }
1522 
1523     return (result==c) ? ~result : result;
1524 }
1525 
1526 /* case mapping properties API ---------------------------------------------- */
1527 
1528 /* public API (see uchar.h) */
1529 
1530 U_CAPI UBool U_EXPORT2
u_isULowercase(UChar32 c)1531 u_isULowercase(UChar32 c) {
1532     return (UBool)(UCASE_LOWER==ucase_getType(c));
1533 }
1534 
1535 U_CAPI UBool U_EXPORT2
u_isUUppercase(UChar32 c)1536 u_isUUppercase(UChar32 c) {
1537     return (UBool)(UCASE_UPPER==ucase_getType(c));
1538 }
1539 
1540 /* Transforms the Unicode character to its lower case equivalent.*/
1541 U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c)1542 u_tolower(UChar32 c) {
1543     return ucase_tolower(c);
1544 }
1545 
1546 /* Transforms the Unicode character to its upper case equivalent.*/
1547 U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c)1548 u_toupper(UChar32 c) {
1549     return ucase_toupper(c);
1550 }
1551 
1552 /* Transforms the Unicode character to its title case equivalent.*/
1553 U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c)1554 u_totitle(UChar32 c) {
1555     return ucase_totitle(c);
1556 }
1557 
1558 /* return the simple case folding mapping for c */
1559 U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c,uint32_t options)1560 u_foldCase(UChar32 c, uint32_t options) {
1561     return ucase_fold(c, options);
1562 }
1563 
1564 U_CFUNC int32_t U_EXPORT2
ucase_hasBinaryProperty(UChar32 c,UProperty which)1565 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
1566     /* case mapping properties */
1567     const UChar *resultString;
1568     switch(which) {
1569     case UCHAR_LOWERCASE:
1570         return (UBool)(UCASE_LOWER==ucase_getType(c));
1571     case UCHAR_UPPERCASE:
1572         return (UBool)(UCASE_UPPER==ucase_getType(c));
1573     case UCHAR_SOFT_DOTTED:
1574         return ucase_isSoftDotted(c);
1575     case UCHAR_CASE_SENSITIVE:
1576         return ucase_isCaseSensitive(c);
1577     case UCHAR_CASED:
1578         return (UBool)(UCASE_NONE!=ucase_getType(c));
1579     case UCHAR_CASE_IGNORABLE:
1580         return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
1581     /*
1582      * Note: The following Changes_When_Xyz are defined as testing whether
1583      * the NFD form of the input changes when Xyz-case-mapped.
1584      * However, this simpler implementation of these properties,
1585      * ignoring NFD, passes the tests.
1586      * The implementation needs to be changed if the tests start failing.
1587      * When that happens, optimizations should be used to work with the
1588      * per-single-code point ucase_toFullXyz() functions unless
1589      * the NFD form has more than one code point,
1590      * and the property starts set needs to be the union of the
1591      * start sets for normalization and case mappings.
1592      */
1593     case UCHAR_CHANGES_WHEN_LOWERCASED:
1594         return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1595     case UCHAR_CHANGES_WHEN_UPPERCASED:
1596         return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1597     case UCHAR_CHANGES_WHEN_TITLECASED:
1598         return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1599     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
1600     case UCHAR_CHANGES_WHEN_CASEMAPPED:
1601         return (UBool)(
1602             ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1603             ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
1604             ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
1605     default:
1606         return FALSE;
1607     }
1608 }
1609