1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // ucasemap_imp.h
5 // created: 2017feb08 Markus W. Scherer
6 
7 #ifndef __UCASEMAP_IMP_H__
8 #define __UCASEMAP_IMP_H__
9 
10 #include "unicode/utypes.h"
11 #include "unicode/ucasemap.h"
12 #include "unicode/uchar.h"
13 #include "ucase.h"
14 
15 /**
 * Bit mask for the titlecasing iterator options bit field.
 * 0xe0 covers three adjacent bits (bits 5..7), so the field can hold 8 distinct values.
 * Currently only 3 out of 8 values are used:
 * 0 (words), U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
 * See stringoptions.h.
 * @internal
 */
22 #define U_TITLECASE_ITERATOR_MASK 0xe0
23 
24 /**
 * Bit mask for the titlecasing index adjustment options bit set.
 * 0x600 covers bits 9 and 10 of the options word.
 * Currently two bits are defined:
 * U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED.
 * An options value that has every bit of this mask set is rejected by
 * ustrcase_checkTitleAdjustmentOptions() in this header.
 * See stringoptions.h.
 * @internal
 */
31 #define U_TITLECASE_ADJUSTMENT_MASK 0x600
32 
33 /**
 * Internal API, used by u_strcasecmp() etc.
 * Compares two UChar strings case-insensitively,
 * in code point order or code unit order.
 *
 * @param s1          first string
 * @param length1     length of s1
 *                    (NOTE(review): presumably -1 means NUL-terminated, as documented for
 *                    u_caseInsensitivePrefixMatch() — confirm)
 * @param s2          second string
 * @param length2     length of s2 (same convention as length1)
 * @param options     compare options
 *                    (NOTE(review): presumably selects code point vs. code unit order — confirm)
 * @param pErrorCode  receives error status
 * @return comparison result
 *         (NOTE(review): sign convention is not visible from this header — confirm)
 */
38 U_CFUNC int32_t
39 u_strcmpFold(const UChar *s1, int32_t length1,
40              const UChar *s2, int32_t length2,
41              uint32_t options,
42              UErrorCode *pErrorCode);
43 
44 /**
 * Internal API, used for detecting the length of the
 * prefix that two strings share when compared case-insensitively.
 * The function returns nothing; both results are delivered through
 * matchLen1 and matchLen2. Note that it is declared U_CAPI, unlike the
 * neighboring internal functions which are declared U_CFUNC.
 *
 * @param s1            input string 1
 * @param length1       length of string 1, or -1 (NUL-terminated)
 * @param s2            input string 2
 * @param length2       length of string 2, or -1 (NUL-terminated)
 * @param options       compare options
 * @param matchLen1     (output) length of partial prefix match in s1
 * @param matchLen2     (output) length of partial prefix match in s2
 * @param pErrorCode    receives error status
 */
56 U_CAPI void
57 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
58                              const UChar *s2, int32_t length2,
59                              uint32_t options,
60                              int32_t *matchLen1, int32_t *matchLen2,
61                              UErrorCode *pErrorCode);
62 
63 /**
 * Are the Unicode properties loaded?
 * This must be used before internal functions are called that do
 * not perform this check themselves.
 * Generates a debug assertion failure if the data is not loaded.
 *
 * @param pErrorCode  error status
 *                    (NOTE(review): whether it is only written or also checked on entry
 *                    is not visible from this header — confirm)
 * @return NOTE(review): presumably TRUE when the properties data is available — confirm
 */
69 U_CFUNC UBool
70 uprv_haveProperties(UErrorCode *pErrorCode);
71 
72 #ifdef __cplusplus
73 
74 U_NAMESPACE_BEGIN
75 
// Forward declarations: the declarations below only use these classes
// through pointers and references.
76 class BreakIterator;        // unicode/brkiter.h
77 class ByteSink;             // NOTE(review): presumably unicode/bytestream.h — confirm
78 class Locale;               // unicode/locid.h
79 
80 /** Returns TRUE if the options are valid. Otherwise FALSE, and sets an error. */
ustrcase_checkTitleAdjustmentOptions(uint32_t options,UErrorCode & errorCode)81 inline UBool ustrcase_checkTitleAdjustmentOptions(uint32_t options, UErrorCode &errorCode) {
82     if (U_FAILURE(errorCode)) { return FALSE; }
83     if ((options & U_TITLECASE_ADJUSTMENT_MASK) == U_TITLECASE_ADJUSTMENT_MASK) {
84         // Both options together.
85         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
86         return FALSE;
87     }
88     return TRUE;
89 }
90 
ustrcase_isLNS(UChar32 c)91 inline UBool ustrcase_isLNS(UChar32 c) {
92     // Letter, number, symbol,
93     // or a private use code point because those are typically used as letters or numbers.
94     // Consider modifier letters only if they are cased.
95     const uint32_t LNS = (U_GC_L_MASK|U_GC_N_MASK|U_GC_S_MASK|U_GC_CO_MASK) & ~U_GC_LM_MASK;
96     int gc = u_charType(c);
97     return (U_MASK(gc) & LNS) != 0 || (gc == U_MODIFIER_LETTER && ucase_getType(c) != UCASE_NONE);
98 }
99 
100 #if !UCONFIG_NO_BREAK_ITERATION
101 
102 /**
 * Provides the BreakIterator to use for titlecasing.
 * Returns nullptr if error. Pass in either locale or locID, not both.
 * Only declared when break iteration is compiled in (!UCONFIG_NO_BREAK_ITERATION).
 *
 * @param locale     locale object, as an alternative to locID
 * @param locID      locale ID string, as an alternative to locale
 * @param options    options bit set
 *                   (NOTE(review): presumably the U_TITLECASE_ITERATOR_MASK field
 *                   selects the iterator type — confirm)
 * @param iter       caller-supplied iterator
 *                   (NOTE(review): presumably returned as-is when not null — confirm)
 * @param ownedIter  NOTE(review): presumably receives ownership of an iterator
 *                   created by this function — confirm
 * @param errorCode  in/out error code
 */
103 U_CFUNC
104 BreakIterator *ustrcase_getTitleBreakIterator(
105         const Locale *locale, const char *locID, uint32_t options, BreakIterator *iter,
106         LocalPointer<BreakIterator> &ownedIter, UErrorCode &errorCode);
107 
108 #endif
109 
110 U_NAMESPACE_END
111 
112 #include "unicode/unistr.h"  // for UStringCaseMapper
113 
114 /*
115  * Internal string casing functions implementing
116  * ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
117  */
118 
/**
 * Definition of the UCaseMap struct: a locale, its case-locale value, an options word
 * and, when break iteration is compiled in, a BreakIterator.
 * NOTE(review): the struct declares a destructor and owns `iter`, but no copy
 * constructor or copy assignment is declared here; copying an instance presumably
 * must be avoided — confirm.
 */
119 struct UCaseMap : public icu::UMemory {
120     /** Implements most of ucasemap_open(). */
121     UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
122     ~UCaseMap();
123 
124 #if !UCONFIG_NO_BREAK_ITERATION
125     icu::BreakIterator *iter;  /* We adopt the iterator, so we own it. */
126 #endif
    // Fixed-size buffer for the locale ID.
    // NOTE(review): how IDs longer than 31 chars are handled is not visible here.
127     char locale[32];
    // NOTE(review): presumably the value computed by ustrcase_getCaseLocale() — confirm.
128     int32_t caseLocale;
    // Options bit set.
129     uint32_t options;
130 };
131 
/*
 * Helper macros for splicing an optional BreakIterator into parameter and argument lists.
 * With UCONFIG_NO_BREAK_ITERATION they all expand to nothing. Otherwise each expands to
 * the text below, including its trailing comma:
 *   UCASEMAP_BREAK_ITERATOR_PARAM   a named parameter declaration
 *   UCASEMAP_BREAK_ITERATOR_UNUSED  an unnamed parameter declaration
 *   UCASEMAP_BREAK_ITERATOR         the argument `iter`
 *   UCASEMAP_BREAK_ITERATOR_NULL    a NULL argument
 * Because the comma is part of the expansion, uses of these macros
 * (for example in the ustrcase_internalToLower() declaration) are not followed by a comma.
 */
132 #if UCONFIG_NO_BREAK_ITERATION
133 #   define UCASEMAP_BREAK_ITERATOR_PARAM
134 #   define UCASEMAP_BREAK_ITERATOR_UNUSED
135 #   define UCASEMAP_BREAK_ITERATOR
136 #   define UCASEMAP_BREAK_ITERATOR_NULL
137 #else
138 #   define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
139 #   define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
140 #   define UCASEMAP_BREAK_ITERATOR iter,
141 #   define UCASEMAP_BREAK_ITERATOR_NULL NULL,
142 #endif
143 
/**
 * Maps a locale ID string to an int32_t value.
 * NOTE(review): presumably this is the value passed as `caseLocale` to the
 * case mapping functions declared in this header — confirm.
 *
 * @param locale  locale ID string
 */
144 U_CFUNC int32_t
145 ustrcase_getCaseLocale(const char *locale);
146 
147 // TODO: swap src / dest if approved for new public api
148 /**
 * Lowercasing function. Implements UStringCaseMapper.
 * The break iterator parameter exists only when break iteration is compiled in
 * (see UCASEMAP_BREAK_ITERATOR_PARAM).
 * NOTE(review): the parameter roles below are inferred from the names — confirm
 * against the implementation.
 *
 * @param caseLocale    case-locale value
 * @param options       options bit set
 * @param dest          destination buffer
 * @param destCapacity  capacity of dest
 * @param src           source text
 * @param srcLength     length of src
 * @param edits         Edits object (presumably optional and may be null — confirm)
 * @param errorCode     in/out error code
 * @return presumably the length of the result — confirm
 */
149 U_CFUNC int32_t U_CALLCONV
150 ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
151                          UChar *dest, int32_t destCapacity,
152                          const UChar *src, int32_t srcLength,
153                          icu::Edits *edits,
154                          UErrorCode &errorCode);
155 
156 /**
 * Uppercasing function. Implements UStringCaseMapper.
 * Takes the same parameter list as ustrcase_internalToLower(); the break iterator
 * parameter exists only when break iteration is compiled in
 * (see UCASEMAP_BREAK_ITERATOR_PARAM).
 * NOTE(review): dest/destCapacity presumably describe the output buffer, src/srcLength
 * the input, and the return value the result length — confirm against the implementation.
 */
157 U_CFUNC int32_t U_CALLCONV
158 ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
159                          UChar *dest, int32_t destCapacity,
160                          const UChar *src, int32_t srcLength,
161                          icu::Edits *edits,
162                          UErrorCode &errorCode);
163 
164 #if !UCONFIG_NO_BREAK_ITERATION
165 
166 /**
 * Titlecasing function. Implements UStringCaseMapper.
 * Declared only when break iteration is compiled in, so the BreakIterator parameter
 * is spelled out directly instead of using UCASEMAP_BREAK_ITERATOR_PARAM.
 * NOTE(review): dest/destCapacity presumably describe the output buffer, src/srcLength
 * the input, and the return value the result length — confirm against the implementation.
 */
167 U_CFUNC int32_t U_CALLCONV
168 ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
169                          icu::BreakIterator *iter,
170                          UChar *dest, int32_t destCapacity,
171                          const UChar *src, int32_t srcLength,
172                          icu::Edits *edits,
173                          UErrorCode &errorCode);
174 
175 #endif
176 
177 /**
 * Case folding function. Implements UStringCaseMapper.
 * Takes the same parameter list as ustrcase_internalToLower(); the break iterator
 * parameter exists only when break iteration is compiled in
 * (see UCASEMAP_BREAK_ITERATOR_PARAM).
 * NOTE(review): dest/destCapacity presumably describe the output buffer, src/srcLength
 * the input, and the return value the result length — confirm against the implementation.
 */
178 U_CFUNC int32_t U_CALLCONV
179 ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
180                       UChar *dest, int32_t destCapacity,
181                       const UChar *src, int32_t srcLength,
182                       icu::Edits *edits,
183                       UErrorCode &errorCode);
184 
185 /**
 * Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
 * Implements argument checking.
 *
 * @param caseLocale        case-locale value
 * @param options           options bit set
 * @param dest              destination buffer
 * @param destCapacity      capacity of dest
 * @param src               source text
 * @param srcLength         length of src
 * @param stringCaseMapper  the UStringCaseMapper function that performs the mapping
 * @param edits             Edits object (presumably optional and may be null — confirm)
 * @param errorCode         in/out error code
 * @return NOTE(review): presumably the length of the result — confirm
 */
189 U_CFUNC int32_t
190 ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
191              UChar *dest, int32_t destCapacity,
192              const UChar *src, int32_t srcLength,
193              UStringCaseMapper *stringCaseMapper,
194              icu::Edits *edits,
195              UErrorCode &errorCode);
196 
197 /**
 * Common string case mapping implementation for old-fashioned u_strToXyz() functions
 * that allow the source string to overlap the destination buffer.
 * Implements argument checking and internally works with an intermediate buffer if necessary.
 * Unlike ustrcase_map(), this function takes no Edits parameter.
 *
 * @param caseLocale        case-locale value
 * @param options           options bit set
 * @param dest              destination buffer; may overlap src
 * @param destCapacity      capacity of dest
 * @param src               source text
 * @param srcLength         length of src
 * @param stringCaseMapper  the UStringCaseMapper function that performs the mapping
 * @param errorCode         in/out error code
 * @return NOTE(review): presumably the length of the result — confirm
 */
202 U_CFUNC int32_t
203 ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
204                         UChar *dest, int32_t destCapacity,
205                         const UChar *src, int32_t srcLength,
206                         UStringCaseMapper *stringCaseMapper,
207                         UErrorCode &errorCode);
208 
209 /**
 * UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
 * UTF-8 version of UStringCaseMapper.
 * All error checking must be done.
 * The UCaseMap must be fully initialized, with locale and/or iter set as needed.
 *
 * Output goes to a ByteSink rather than to a destination buffer, and the
 * function type returns void.
 * NOTE(review): the signature takes caseLocale/options/iter rather than a UCaseMap,
 * so the sentence about the UCaseMap looks stale; "all error checking must be done"
 * presumably means "done by the caller beforehand" — confirm both.
 */
215 typedef void U_CALLCONV
216 UTF8CaseMapper(int32_t caseLocale, uint32_t options,
217 #if !UCONFIG_NO_BREAK_ITERATION
218                icu::BreakIterator *iter,
219 #endif
220                const uint8_t *src, int32_t srcLength,
221                icu::ByteSink &sink, icu::Edits *edits,
222                UErrorCode &errorCode);
223 
224 #if !UCONFIG_NO_BREAK_ITERATION
225 
226 /**
 * UTF-8 titlecasing function. Implements UTF8CaseMapper.
 * Declared only when break iteration is compiled in.
 * The parameter list matches the UTF8CaseMapper function type:
 * input in src/srcLength, output to sink, error status in errorCode.
 */
227 U_CFUNC void U_CALLCONV
228 ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
229         icu::BreakIterator *iter,
230         const uint8_t *src, int32_t srcLength,
231         icu::ByteSink &sink, icu::Edits *edits,
232         UErrorCode &errorCode);
233 
234 #endif
235 
/**
 * ByteSink overload of ucasemap_mapUTF8(): takes UTF-8 input in src/srcLength,
 * a UTF8CaseMapper function and a ByteSink for the output, and returns nothing.
 * A second overload with a dest/destCapacity buffer is declared right after this one.
 * NOTE(review): presumably performs argument checking and then calls stringCaseMapper —
 * confirm against the implementation.
 */
236 void
237 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
238                  const char *src, int32_t srcLength,
239                  UTF8CaseMapper *stringCaseMapper,
240                  icu::ByteSink &sink, icu::Edits *edits,
241                  UErrorCode &errorCode);
242 
243 /**
 * Implements argument checking and buffer handling
 * for UTF-8 string case mapping as a common function.
 * This is the overload that writes into a caller-provided char buffer
 * (dest/destCapacity) instead of a ByteSink.
 *
 * @param caseLocale        case-locale value
 * @param options           options bit set
 * @param dest              destination buffer
 * @param destCapacity      capacity of dest
 * @param src               UTF-8 source text
 * @param srcLength         length of src
 * @param stringCaseMapper  the UTF8CaseMapper function that performs the mapping
 * @param edits             Edits object (presumably optional and may be null — confirm)
 * @param errorCode         in/out error code
 * @return NOTE(review): presumably the length of the result — confirm
 */
247 int32_t
248 ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
249                  char *dest, int32_t destCapacity,
250                  const char *src, int32_t srcLength,
251                  UTF8CaseMapper *stringCaseMapper,
252                  icu::Edits *edits,
253                  UErrorCode &errorCode);
254 
255 U_NAMESPACE_BEGIN
// Bit constants and data lookup functions, judging by the names for Greek uppercasing.
// NOTE(review): the exact layout of the values returned by getLetterData() and
// getDiacriticData() is defined by the implementation — confirm there.
256 namespace GreekUpper {
257 
258 // Data bits.
// UPPER_MASK covers the low 10 bits of a data value; the HAS_* flags below use
// higher bits and do not overlap it.
259 static const uint32_t UPPER_MASK = 0x3ff;
260 static const uint32_t HAS_VOWEL = 0x1000;
261 static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
262 static const uint32_t HAS_ACCENT = 0x4000;
263 static const uint32_t HAS_DIALYTIKA = 0x8000;
264 // Further bits during data building and processing, not stored in the data map.
265 static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
266 static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
267 
// Convenience combinations of the single-bit flags above.
268 static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
269 static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
270         HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
271 static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
272 
273 // State bits.
// NOTE(review): presumably tracked by the caller while iterating over the text — confirm.
274 static const uint32_t AFTER_CASED = 1;
275 static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
276 
// Returns the data value for code point c.
// NOTE(review): presumably a combination of the "Data bits" above — confirm.
277 uint32_t getLetterData(UChar32 c);
278 
279 /**
 * Returns a non-zero value for each of the Greek combining diacritics
 * listed in The Unicode Standard, version 8, chapter 7.2 Greek,
 * plus some perispomeni look-alikes.
 */
284 uint32_t getDiacriticData(UChar32 c);
285 
286 }  // namespace GreekUpper
287 U_NAMESPACE_END
288 
289 #endif  // __cplusplus
290 
291 #endif  // __UCASEMAP_IMP_H__
292