1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // casemap.h
5 // created: 2017jan12 Markus W. Scherer
6 
7 #ifndef __CASEMAP_H__
8 #define __CASEMAP_H__
9 
10 #include "unicode/utypes.h"
11 #include "unicode/stringpiece.h"
12 #include "unicode/uobject.h"
13 
14 /**
15  * \file
16  * \brief C++ API: Low-level C++ case mapping functions.
17  */
18 
19 U_NAMESPACE_BEGIN
20 
21 class BreakIterator;  // forward declaration: only used as a pointer parameter (toTitle, utf8ToTitle)
22 class ByteSink;       // forward declaration: only used as a reference parameter (UTF-8 sink overloads)
23 class Edits;          // forward declaration: only used as an optional pointer parameter
24 
25 /**
26  * Low-level C++ case mapping functions. All functions are static; the class cannot be instantiated or copied.
27  *
28  * @stable ICU 59
29  */
30 class U_COMMON_API CaseMap U_FINAL : public UMemory {
31 public:
32     /**
33      * Lowercases a UTF-16 string and optionally records edits.
34      * Casing is locale-dependent and context-sensitive.
35      * The result may be longer or shorter than the original.
36      * The source string and the destination buffer must not overlap.
37      *
38      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
39      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
40      * @param src       The original string.
41      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
42      * @param dest      A buffer for the result string. The result will be NUL-terminated if
43      *                  the buffer is large enough.
44      *                  The contents are undefined in case of failure.
45      * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
46      *                  dest may be NULL and the function will only return the length of the result
47      *                  without writing any of the result string.
48      * @param edits     Records edits for index mapping, working with styled text,
49      *                  and getting only changes (if any).
50      *                  The Edits contents are undefined if any error occurs.
51      *                  This function calls edits->reset() first unless
52      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
53      * @param errorCode Reference to an in/out error code value
54      *                  which must not indicate a failure before the function call.
55      * @return The length of the result string, if successful.
56      *         When the result would be longer than destCapacity,
57      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
58      *
59      * @see u_strToLower
60      * @stable ICU 59
61      */
62      static int32_t toLower(
63             const char *locale, uint32_t options,
64             const char16_t *src, int32_t srcLength,
65             char16_t *dest, int32_t destCapacity, Edits *edits,
66             UErrorCode &errorCode);
67 
68     /**
69      * Uppercases a UTF-16 string and optionally records edits.
70      * Casing is locale-dependent and context-sensitive.
71      * The result may be longer or shorter than the original.
72      * The source string and the destination buffer must not overlap.
73      *
74      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
75      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
76      * @param src       The original string.
77      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
78      * @param dest      A buffer for the result string. The result will be NUL-terminated if
79      *                  the buffer is large enough.
80      *                  The contents are undefined in case of failure.
81      * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
82      *                  dest may be NULL and the function will only return the length of the result
83      *                  without writing any of the result string.
84      * @param edits     Records edits for index mapping, working with styled text,
85      *                  and getting only changes (if any).
86      *                  The Edits contents are undefined if any error occurs.
87      *                  This function calls edits->reset() first unless
88      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
89      * @param errorCode Reference to an in/out error code value
90      *                  which must not indicate a failure before the function call.
91      * @return The length of the result string, if successful.
92      *         When the result would be longer than destCapacity,
93      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
94      *
95      * @see u_strToUpper
96      * @stable ICU 59
97      */
98     static int32_t toUpper(
99             const char *locale, uint32_t options,
100             const char16_t *src, int32_t srcLength,
101             char16_t *dest, int32_t destCapacity, Edits *edits,
102             UErrorCode &errorCode);
103 
104 #if !UCONFIG_NO_BREAK_ITERATION
105 
106     /**
107      * Titlecases a UTF-16 string and optionally records edits.
108      * Casing is locale-dependent and context-sensitive.
109      * The result may be longer or shorter than the original.
110      * The source string and the destination buffer must not overlap.
111      *
112      * Titlecasing uses a break iterator to find the first characters of words
113      * that are to be titlecased. It titlecases those characters and lowercases
114      * all others. (This can be modified with options bits.)
115      *
116      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
117      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
118      *                  U_TITLECASE_NO_LOWERCASE,
119      *                  U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
120      *                  U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
121      * @param iter      A break iterator to find the first characters of words that are to be titlecased.
122      *                  It is set to the source string (setText())
123      *                  and used one or more times for iteration (first() and next()).
124      *                  If NULL, then a word break iterator for the locale is used
125      *                  (or something equivalent).
126      * @param src       The original string.
127      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
128      * @param dest      A buffer for the result string. The result will be NUL-terminated if
129      *                  the buffer is large enough.
130      *                  The contents are undefined in case of failure.
131      * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
132      *                  dest may be NULL and the function will only return the length of the result
133      *                  without writing any of the result string.
134      * @param edits     Records edits for index mapping, working with styled text,
135      *                  and getting only changes (if any).
136      *                  The Edits contents are undefined if any error occurs.
137      *                  This function calls edits->reset() first unless
138      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
139      * @param errorCode Reference to an in/out error code value
140      *                  which must not indicate a failure before the function call.
141      * @return The length of the result string, if successful.
142      *         When the result would be longer than destCapacity,
143      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
144      *
145      * @see u_strToTitle
146      * @see ucasemap_toTitle
147      * @stable ICU 59
148      */
149     static int32_t toTitle(
150             const char *locale, uint32_t options, BreakIterator *iter,
151             const char16_t *src, int32_t srcLength,
152             char16_t *dest, int32_t destCapacity, Edits *edits,
153             UErrorCode &errorCode);
154 
155 #endif  // !UCONFIG_NO_BREAK_ITERATION
156 
157     /**
158      * Case-folds a UTF-16 string and optionally records edits.
159      *
160      * Case folding is locale-independent and not context-sensitive,
161      * but there is an option for whether to include or exclude mappings for dotted I
162      * and dotless i that are marked with 'T' in CaseFolding.txt.
163      *
164      * The result may be longer or shorter than the original.
165      * The source string and the destination buffer must not overlap.
166      *
167      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
168      *                  U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
169      * @param src       The original string.
170      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
171      * @param dest      A buffer for the result string. The result will be NUL-terminated if
172      *                  the buffer is large enough.
173      *                  The contents are undefined in case of failure.
174      * @param destCapacity The size of the buffer (number of char16_ts). If it is 0, then
175      *                  dest may be NULL and the function will only return the length of the result
176      *                  without writing any of the result string.
177      * @param edits     Records edits for index mapping, working with styled text,
178      *                  and getting only changes (if any).
179      *                  The Edits contents are undefined if any error occurs.
180      *                  This function calls edits->reset() first unless
181      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
182      * @param errorCode Reference to an in/out error code value
183      *                  which must not indicate a failure before the function call.
184      * @return The length of the result string, if successful.
185      *         When the result would be longer than destCapacity,
186      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
187      *
188      * @see u_strFoldCase
189      * @stable ICU 59
190      */
191     static int32_t fold(
192             uint32_t options,
193             const char16_t *src, int32_t srcLength,
194             char16_t *dest, int32_t destCapacity, Edits *edits,
195             UErrorCode &errorCode);
196 
197     /**
198      * Lowercases a UTF-8 string, writes the result to a ByteSink, and optionally records edits.
199      * Casing is locale-dependent and context-sensitive.
200      * The result may be longer or shorter than the original.
201      *
202      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
203      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
204      * @param src       The original string.
205      * @param sink      A ByteSink to which the result string is written.
206      *                  sink.Flush() is called at the end.
207      * @param edits     Records edits for index mapping, working with styled text,
208      *                  and getting only changes (if any).
209      *                  The Edits contents are undefined if any error occurs.
210      *                  This function calls edits->reset() first unless
211      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
212      * @param errorCode Reference to an in/out error code value
213      *                  which must not indicate a failure before the function call.
214      *
215      * @see ucasemap_utf8ToLower
216      * @stable ICU 60
217      */
218     static void utf8ToLower(
219             const char *locale, uint32_t options,
220             StringPiece src, ByteSink &sink, Edits *edits,
221             UErrorCode &errorCode);
222 
223     /**
224      * Uppercases a UTF-8 string, writes the result to a ByteSink, and optionally records edits.
225      * Casing is locale-dependent and context-sensitive.
226      * The result may be longer or shorter than the original.
227      *
228      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
229      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
230      * @param src       The original string.
231      * @param sink      A ByteSink to which the result string is written.
232      *                  sink.Flush() is called at the end.
233      * @param edits     Records edits for index mapping, working with styled text,
234      *                  and getting only changes (if any).
235      *                  The Edits contents are undefined if any error occurs.
236      *                  This function calls edits->reset() first unless
237      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
238      * @param errorCode Reference to an in/out error code value
239      *                  which must not indicate a failure before the function call.
240      *
241      * @see ucasemap_utf8ToUpper
242      * @stable ICU 60
243      */
244     static void utf8ToUpper(
245             const char *locale, uint32_t options,
246             StringPiece src, ByteSink &sink, Edits *edits,
247             UErrorCode &errorCode);
248 
249 #if !UCONFIG_NO_BREAK_ITERATION
250 
251     /**
252      * Titlecases a UTF-8 string, writes the result to a ByteSink, and optionally records edits.
253      * Casing is locale-dependent and context-sensitive.
254      * The result may be longer or shorter than the original.
255      *
256      * Titlecasing uses a break iterator to find the first characters of words
257      * that are to be titlecased. It titlecases those characters and lowercases
258      * all others. (This can be modified with options bits.)
259      *
260      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
261      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
262      *                  U_TITLECASE_NO_LOWERCASE,
263      *                  U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
264      *                  U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
265      * @param iter      A break iterator to find the first characters of words that are to be titlecased.
266      *                  It is set to the source string (setUText())
267      *                  and used one or more times for iteration (first() and next()).
268      *                  If NULL, then a word break iterator for the locale is used
269      *                  (or something equivalent).
270      * @param src       The original string.
271      * @param sink      A ByteSink to which the result string is written.
272      *                  sink.Flush() is called at the end.
273      * @param edits     Records edits for index mapping, working with styled text,
274      *                  and getting only changes (if any).
275      *                  The Edits contents are undefined if any error occurs.
276      *                  This function calls edits->reset() first unless
277      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
278      * @param errorCode Reference to an in/out error code value
279      *                  which must not indicate a failure before the function call.
280      *
281      * @see ucasemap_utf8ToTitle
282      * @stable ICU 60
283      */
284     static void utf8ToTitle(
285             const char *locale, uint32_t options, BreakIterator *iter,
286             StringPiece src, ByteSink &sink, Edits *edits,
287             UErrorCode &errorCode);
288 
289 #endif  // !UCONFIG_NO_BREAK_ITERATION
290 
291     /**
292      * Case-folds a UTF-8 string, writes the result to a ByteSink, and optionally records edits.
293      *
294      * Case folding is locale-independent and not context-sensitive,
295      * but there is an option for whether to include or exclude mappings for dotted I
296      * and dotless i that are marked with 'T' in CaseFolding.txt.
297      *
298      * The result may be longer or shorter than the original.
299      *
300      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET, U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
301      * @param src       The original string.
302      * @param sink      A ByteSink to which the result string is written.
303      *                  sink.Flush() is called at the end.
304      * @param edits     Records edits for index mapping, working with styled text,
305      *                  and getting only changes (if any).
306      *                  The Edits contents are undefined if any error occurs.
307      *                  This function calls edits->reset() first unless
308      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
309      * @param errorCode Reference to an in/out error code value
310      *                  which must not indicate a failure before the function call.
311      *
312      * @see ucasemap_utf8FoldCase
313      * @stable ICU 60
314      */
315     static void utf8Fold(
316             uint32_t options,
317             StringPiece src, ByteSink &sink, Edits *edits,
318             UErrorCode &errorCode);
319 
320     /**
321      * Lowercases a UTF-8 string into a caller-provided buffer and optionally records edits.
322      * Casing is locale-dependent and context-sensitive.
323      * The result may be longer or shorter than the original.
324      * The source string and the destination buffer must not overlap.
325      *
326      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
327      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
328      * @param src       The original string.
329      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
330      * @param dest      A buffer for the result string. The result will be NUL-terminated if
331      *                  the buffer is large enough.
332      *                  The contents are undefined in case of failure.
333      * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
334      *                  dest may be NULL and the function will only return the length of the result
335      *                  without writing any of the result string.
336      * @param edits     Records edits for index mapping, working with styled text,
337      *                  and getting only changes (if any).
338      *                  The Edits contents are undefined if any error occurs.
339      *                  This function calls edits->reset() first unless
340      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
341      * @param errorCode Reference to an in/out error code value
342      *                  which must not indicate a failure before the function call.
343      * @return The length of the result string, if successful.
344      *         When the result would be longer than destCapacity,
345      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
346      *
347      * @see ucasemap_utf8ToLower
348      * @stable ICU 59
349      */
350     static int32_t utf8ToLower(
351             const char *locale, uint32_t options,
352             const char *src, int32_t srcLength,
353             char *dest, int32_t destCapacity, Edits *edits,
354             UErrorCode &errorCode);
355 
356     /**
357      * Uppercases a UTF-8 string into a caller-provided buffer and optionally records edits.
358      * Casing is locale-dependent and context-sensitive.
359      * The result may be longer or shorter than the original.
360      * The source string and the destination buffer must not overlap.
361      *
362      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
363      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
364      * @param src       The original string.
365      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
366      * @param dest      A buffer for the result string. The result will be NUL-terminated if
367      *                  the buffer is large enough.
368      *                  The contents are undefined in case of failure.
369      * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
370      *                  dest may be NULL and the function will only return the length of the result
371      *                  without writing any of the result string.
372      * @param edits     Records edits for index mapping, working with styled text,
373      *                  and getting only changes (if any).
374      *                  The Edits contents are undefined if any error occurs.
375      *                  This function calls edits->reset() first unless
376      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
377      * @param errorCode Reference to an in/out error code value
378      *                  which must not indicate a failure before the function call.
379      * @return The length of the result string, if successful.
380      *         When the result would be longer than destCapacity,
381      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
382      *
383      * @see ucasemap_utf8ToUpper
384      * @stable ICU 59
385      */
386     static int32_t utf8ToUpper(
387             const char *locale, uint32_t options,
388             const char *src, int32_t srcLength,
389             char *dest, int32_t destCapacity, Edits *edits,
390             UErrorCode &errorCode);
391 
392 #if !UCONFIG_NO_BREAK_ITERATION
393 
394     /**
395      * Titlecases a UTF-8 string into a caller-provided buffer and optionally records edits.
396      * Casing is locale-dependent and context-sensitive.
397      * The result may be longer or shorter than the original.
398      * The source string and the destination buffer must not overlap.
399      *
400      * Titlecasing uses a break iterator to find the first characters of words
401      * that are to be titlecased. It titlecases those characters and lowercases
402      * all others. (This can be modified with options bits.)
403      *
404      * @param locale    The locale ID. ("" = root locale, NULL = default locale.)
405      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
406      *                  U_TITLECASE_NO_LOWERCASE,
407      *                  U_TITLECASE_NO_BREAK_ADJUSTMENT, U_TITLECASE_ADJUST_TO_CASED,
408      *                  U_TITLECASE_WHOLE_STRING, U_TITLECASE_SENTENCES.
409      * @param iter      A break iterator to find the first characters of words that are to be titlecased.
410      *                  It is set to the source string (setUText())
411      *                  and used one or more times for iteration (first() and next()).
412      *                  If NULL, then a word break iterator for the locale is used
413      *                  (or something equivalent).
414      * @param src       The original string.
415      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
416      * @param dest      A buffer for the result string. The result will be NUL-terminated if
417      *                  the buffer is large enough.
418      *                  The contents are undefined in case of failure.
419      * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
420      *                  dest may be NULL and the function will only return the length of the result
421      *                  without writing any of the result string.
422      * @param edits     Records edits for index mapping, working with styled text,
423      *                  and getting only changes (if any).
424      *                  The Edits contents are undefined if any error occurs.
425      *                  This function calls edits->reset() first unless
426      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
427      * @param errorCode Reference to an in/out error code value
428      *                  which must not indicate a failure before the function call.
429      * @return The length of the result string, if successful.
430      *         When the result would be longer than destCapacity,
431      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
432      *
433      * @see ucasemap_utf8ToTitle
434      * @stable ICU 59
435      */
436     static int32_t utf8ToTitle(
437             const char *locale, uint32_t options, BreakIterator *iter,
438             const char *src, int32_t srcLength,
439             char *dest, int32_t destCapacity, Edits *edits,
440             UErrorCode &errorCode);
441 
442 #endif  // !UCONFIG_NO_BREAK_ITERATION
443 
444     /**
445      * Case-folds a UTF-8 string into a caller-provided buffer and optionally records edits.
446      *
447      * Case folding is locale-independent and not context-sensitive,
448      * but there is an option for whether to include or exclude mappings for dotted I
449      * and dotless i that are marked with 'T' in CaseFolding.txt.
450      *
451      * The result may be longer or shorter than the original.
452      * The source string and the destination buffer must not overlap.
453      *
454      * @param options   Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT, U_EDITS_NO_RESET,
455      *                  U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
456      * @param src       The original string.
457      * @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
458      * @param dest      A buffer for the result string. The result will be NUL-terminated if
459      *                  the buffer is large enough.
460      *                  The contents are undefined in case of failure.
461      * @param destCapacity The size of the buffer (number of bytes). If it is 0, then
462      *                  dest may be NULL and the function will only return the length of the result
463      *                  without writing any of the result string.
464      * @param edits     Records edits for index mapping, working with styled text,
465      *                  and getting only changes (if any).
466      *                  The Edits contents are undefined if any error occurs.
467      *                  This function calls edits->reset() first unless
468      *                  options includes U_EDITS_NO_RESET. edits can be NULL.
469      * @param errorCode Reference to an in/out error code value
470      *                  which must not indicate a failure before the function call.
471      * @return The length of the result string, if successful.
472      *         When the result would be longer than destCapacity,
473      *         the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
474      *
475      * @see ucasemap_utf8FoldCase
476      * @stable ICU 59
477      */
478     static int32_t utf8Fold(
479             uint32_t options,
480             const char *src, int32_t srcLength,
481             char *dest, int32_t destCapacity, Edits *edits,
482             UErrorCode &errorCode);
483 
484 private:
485     CaseMap() = delete;  // static-only class: no instances can be created
486     CaseMap(const CaseMap &other) = delete;  // not copy-constructible
487     CaseMap &operator=(const CaseMap &other) = delete;  // not copy-assignable
488 };
489 
490 U_NAMESPACE_END
491 
492 #endif  // __CASEMAP_H__
493