// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

// ucasemap_imp.h
// created: 2017feb08 Markus W. Scherer
6 
7 #ifndef __UCASEMAP_IMP_H__
8 #define __UCASEMAP_IMP_H__
9 
10 #include "unicode/utypes.h"
11 #include "unicode/ucasemap.h"
12 #include "unicode/uchar.h"
13 #include "ucase.h"
14 
/**
 * Bit mask for the titlecasing iterator options bit field.
 * The mask 0xe0 covers three bits (bits 5..7), so the field has 8 possible values.
 * Currently only 3 out of 8 values are used:
 * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
 * See stringoptions.h.
 * @internal
 */
#define U_TITLECASE_ITERATOR_MASK 0xe0
23 
/**
 * Bit mask for the titlecasing index adjustment options bit set.
 * The mask 0x600 covers two bits (bits 9 and 10).
 * Currently two bits are defined:
 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
 * See stringoptions.h.
 * @internal
 */
#define U_TITLECASE_ADJUSTMENT_MASK 0x600
32 
33 /**
34  * Internal API, used by u_strcasecmp() etc.
35  * Compare strings case-insensitively,
36  * in code point order or code unit order.
37  */
38 U_CFUNC int32_t
39 u_strcmpFold(const UChar *s1, int32_t length1,
40              const UChar *s2, int32_t length2,
41              uint32_t options,
42              UErrorCode *pErrorCode);
43 
44 /**
45  * Internal API, used for detecting length of
46  * shared prefix case-insensitively.
47  * @param s1            input string 1
48  * @param length1       length of string 1, or -1 (NULL terminated)
49  * @param s2            input string 2
50  * @param length2       length of string 2, or -1 (NULL terminated)
51  * @param options       compare options
52  * @param matchLen1     (output) length of partial prefix match in s1
53  * @param matchLen2     (output) length of partial prefix match in s2
54  * @param pErrorCode    receives error status
55  */
56 U_CAPI void
57 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
58                              const UChar *s2, int32_t length2,
59                              uint32_t options,
60                              int32_t *matchLen1, int32_t *matchLen2,
61                              UErrorCode *pErrorCode);
62 
63 #ifdef __cplusplus
64 
65 U_NAMESPACE_BEGIN
66 
// Forward declarations: only pointers and references to these classes are used below.
class BreakIterator;        // unicode/brkiter.h
class ByteSink;             // unicode/bytestream.h
class Locale;               // unicode/locid.h
70 
71 /** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */
ustrcase_checkTitleAdjustmentOptions(uint32_t options,UErrorCode & errorCode)72 inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
73     if (U_FAILURE(errorCode)) { return FALSE; }
74     if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
75         // Both options together.
76         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
77         return FALSE;
78     }
79     return TRUE;
80 }
81 
ustrcase_isLNS(UChar32 c)82 inline UBool ustrcase_isLNS(UChar32 c) {
83     // Letter, number, symbol,
84     // or a private use code point because those are typically used as letters or numbers.
85     // Consider modifier letters only if they are cased.
86     const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
87     int gc = u_charType(c);
88     return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
89 }
90 
91 #if !UCONFIG_NO_BREAK_ITERATION
92 
93 /** Returns nullptr if error. Pass in either locale or locID, not both. */
94 U_CFUNC
95 BreakIterator *ustrcase_getTitleBreakIterator(
96         const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
97         LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
98 
99 #endif
100 
101 U_NAMESPACE_END
102 
103 #include "unicode/unistr.h"  // for UStringCaseMapper
104 
/*
 * Internal string casing functions implementing
 * ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
 */
109 
110 struct UCaseMap : public icu::UMemory {
111     /** Implements most of ucasemap_open(). */
112     UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
113     ~UCaseMap();
114 
115 #if !UCONFIG_NO_BREAK_ITERATION
116     icu::BreakIterator *iter;  /* We adopt the iterator, so we own it. */
117 #endif
118     char locale[32];
119     int32_t caseLocale;
120     uint32_t options;
121 };
122 
/*
 * Helper macros for splicing the optional BreakIterator parameter into
 * function signatures and calls. Each non-empty expansion ends with a comma,
 * so the macro is written without a following comma at the point of use.
 * When UCONFIG_NO_BREAK_ITERATION is set they all expand to nothing.
 *
 * UCASEMAP_BREAK_ITERATOR_PARAM   named parameter in a declaration/definition
 * UCASEMAP_BREAK_ITERATOR_UNUSED  unnamed parameter
 * UCASEMAP_BREAK_ITERATOR         forwards the argument named iter
 * UCASEMAP_BREAK_ITERATOR_NULL    passes a null iterator
 */
#if UCONFIG_NO_BREAK_ITERATION
#   define UCASEMAP_BREAK_ITERATOR_PARAM
#   define UCASEMAP_BREAK_ITERATOR_UNUSED
#   define UCASEMAP_BREAK_ITERATOR
#   define UCASEMAP_BREAK_ITERATOR_NULL
#else
#   define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
#   define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
#   define UCASEMAP_BREAK_ITERATOR iter,
#   define UCASEMAP_BREAK_ITERATOR_NULL NULL,
#endif
134 
135 U_CFUNC int32_t
136 ustrcase_getCaseLocale(const char *locale);
137 
138 // TODO: swap src / dest if approved for new public api
139 /** Implements UStringCaseMapper. */
140 U_CFUNC int32_t U_CALLCONV
141 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
142                          UChar *dest, int32_t destCapacity,
143                          const UChar *src, int32_t srcLength,
144                          icu::Edits *edits,
145                          UErrorCode &errorCode);
146 
147 /** Implements UStringCaseMapper. */
148 U_CFUNC int32_t U_CALLCONV
149 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
150                          UChar *dest, int32_t destCapacity,
151                          const UChar *src, int32_t srcLength,
152                          icu::Edits *edits,
153                          UErrorCode &errorCode);
154 
155 #if !UCONFIG_NO_BREAK_ITERATION
156 
157 /** Implements UStringCaseMapper. */
158 U_CFUNC int32_t U_CALLCONV
159 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
160                          icu::BreakIterator *iter,
161                          UChar *dest, int32_t destCapacity,
162                          const UChar *src, int32_t srcLength,
163                          icu::Edits *edits,
164                          UErrorCode &errorCode);
165 
166 #endif
167 
168 /** Implements UStringCaseMapper. */
169 U_CFUNC int32_t U_CALLCONV
170 ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
171                       UChar *dest, int32_t destCapacity,
172                       const UChar *src, int32_t srcLength,
173                       icu::Edits *edits,
174                       UErrorCode &errorCode);
175 
176 /**
177  * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
178  * Implements argument checking.
179  */
180 U_CFUNC int32_t
181 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
182              UChar *dest, int32_t destCapacity,
183              const UChar *src, int32_t srcLength,
184              UStringCaseMapper *stringCaseMapper,
185              icu::Edits *edits,
186              UErrorCode &errorCode);
187 
188 /**
189  * Common string case mapping implementation for old-fashioned u_strToXyz() functions
190  * that allow the source string to overlap the destination buffer.
191  * Implements argument checking and internally works with an intermediate buffer if necessary.
192  */
193 U_CFUNC int32_t
194 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
195                         UChar *dest, int32_t destCapacity,
196                         const UChar *src, int32_t srcLength,
197                         UStringCaseMapper *stringCaseMapper,
198                         UErrorCode &errorCode);
199 
200 /**
201  * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
202  * UTF-8 version of UStringCaseMapper.
203  * All error checking must be done.
204  * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
205  */
206 typedef void U_CALLCONV
207 UTF8CaseMapper(int32_t caseLocale, uint32_t options,
208 #if !UCONFIG_NO_BREAK_ITERATION
209                icu::BreakIterator *iter,
210 #endif
211                const uint8_t *src, int32_t srcLength,
212                icu::ByteSink &sink, icu::Edits *edits,
213                UErrorCode &errorCode);
214 
215 #if !UCONFIG_NO_BREAK_ITERATION
216 
217 /** Implements UTF8CaseMapper. */
218 U_CFUNC void U_CALLCONV
219 ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
220         icu::BreakIterator *iter,
221         const uint8_t *src, int32_t srcLength,
222         icu::ByteSink &sink, icu::Edits *edits,
223         UErrorCode &errorCode);
224 
225 #endif
226 
227 void
228 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
229                  const char *src, int32_t srcLength,
230                  UTF8CaseMapper *stringCaseMapper,
231                  icu::ByteSink &sink, icu::Edits *edits,
232                  UErrorCode &errorCode);
233 
234 /**
235  * Implements argument checking and buffer handling
236  * for UTF-8 string case mapping as a common function.
237  */
238 int32_t
239 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
240                  char *dest, int32_t destCapacity,
241                  const char *src, int32_t srcLength,
242                  UTF8CaseMapper *stringCaseMapper,
243                  icu::Edits *edits,
244                  UErrorCode &errorCode);
245 
246 U_NAMESPACE_BEGIN
247 namespace GreekUpper {
248 
249 // Data bits.
250 static const uint32_t UPPER_MASK = 0x3ff;
251 static const uint32_t HAS_VOWEL = 0x1000;
252 static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
253 static const uint32_t HAS_ACCENT = 0x4000;
254 static const uint32_t HAS_DIALYTIKA = 0x8000;
255 // Further bits during data building and processing, not stored in the data map.
256 static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
257 static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
258 
259 static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
260 static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
261         HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
262 static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
263 
264 // State bits.
265 static const uint32_t AFTER_CASED = 1;
266 static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
267 
268 uint32_t getLetterData(UChar32 c);
269 
270 /**
271  * Returns a non-zero value for each of the Greek combining diacritics
272  * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
273  * plus some perispomeni look-alikes.
274  */
275 uint32_t getDiacriticData(UChar32 c);
276 
277 }  // namespace GreekUpper
278 U_NAMESPACE_END
279 
280 #endif  // __cplusplus
281 
282 #endif  // __UCASEMAP_IMP_H__
283