1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2009-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: normalizer2.cpp
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2009nov22
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_NORMALIZATION
22
23 #include "unicode/normalizer2.h"
24 #include "unicode/unistr.h"
25 #include "unicode/unorm.h"
26 #include "cstring.h"
27 #include "mutex.h"
28 #include "norm2allmodes.h"
29 #include "normalizer2impl.h"
30 #include "uassert.h"
31 #include "ucln_cmn.h"
32
33 using icu::Normalizer2Impl;
34
35 // NFC/NFD data machine-generated by gennorm2 --csource
36 #define INCLUDED_FROM_NORMALIZER2_CPP
37 #include "norm2_nfc_data.h"
38
39 U_NAMESPACE_BEGIN
40
41 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
42
~Normalizer2()43 Normalizer2::~Normalizer2() {}
44
45 UBool
getRawDecomposition(UChar32,UnicodeString &) const46 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
47 return FALSE;
48 }
49
50 UChar32
composePair(UChar32,UChar32) const51 Normalizer2::composePair(UChar32, UChar32) const {
52 return U_SENTINEL;
53 }
54
55 uint8_t
getCombiningClass(UChar32) const56 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
57 return 0;
58 }
59
60 // Normalizer2 implementation for the old UNORM_NONE.
61 class NoopNormalizer2 : public Normalizer2 {
62 virtual ~NoopNormalizer2();
63
64 virtual UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const65 normalize(const UnicodeString &src,
66 UnicodeString &dest,
67 UErrorCode &errorCode) const {
68 if(U_SUCCESS(errorCode)) {
69 if(&dest!=&src) {
70 dest=src;
71 } else {
72 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
73 }
74 }
75 return dest;
76 }
77 virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const78 normalizeSecondAndAppend(UnicodeString &first,
79 const UnicodeString &second,
80 UErrorCode &errorCode) const {
81 if(U_SUCCESS(errorCode)) {
82 if(&first!=&second) {
83 first.append(second);
84 } else {
85 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
86 }
87 }
88 return first;
89 }
90 virtual UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const91 append(UnicodeString &first,
92 const UnicodeString &second,
93 UErrorCode &errorCode) const {
94 if(U_SUCCESS(errorCode)) {
95 if(&first!=&second) {
96 first.append(second);
97 } else {
98 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
99 }
100 }
101 return first;
102 }
103 virtual UBool
getDecomposition(UChar32,UnicodeString &) const104 getDecomposition(UChar32, UnicodeString &) const {
105 return FALSE;
106 }
107 // No need to override the default getRawDecomposition().
108 virtual UBool
isNormalized(const UnicodeString &,UErrorCode &) const109 isNormalized(const UnicodeString &, UErrorCode &) const {
110 return TRUE;
111 }
112 virtual UNormalizationCheckResult
quickCheck(const UnicodeString &,UErrorCode &) const113 quickCheck(const UnicodeString &, UErrorCode &) const {
114 return UNORM_YES;
115 }
116 virtual int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode &) const117 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
118 return s.length();
119 }
hasBoundaryBefore(UChar32) const120 virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
hasBoundaryAfter(UChar32) const121 virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
isInert(UChar32) const122 virtual UBool isInert(UChar32) const { return TRUE; }
123 };
124
~NoopNormalizer2()125 NoopNormalizer2::~NoopNormalizer2() {}
126
~Normalizer2WithImpl()127 Normalizer2WithImpl::~Normalizer2WithImpl() {}
128
~DecomposeNormalizer2()129 DecomposeNormalizer2::~DecomposeNormalizer2() {}
130
~ComposeNormalizer2()131 ComposeNormalizer2::~ComposeNormalizer2() {}
132
~FCDNormalizer2()133 FCDNormalizer2::~FCDNormalizer2() {}
134
135 // instance cache ---------------------------------------------------------- ***
136
~Norm2AllModes()137 Norm2AllModes::~Norm2AllModes() {
138 delete impl;
139 }
140
141 Norm2AllModes *
createInstance(Normalizer2Impl * impl,UErrorCode & errorCode)142 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
143 if(U_FAILURE(errorCode)) {
144 delete impl;
145 return NULL;
146 }
147 Norm2AllModes *allModes=new Norm2AllModes(impl);
148 if(allModes==NULL) {
149 errorCode=U_MEMORY_ALLOCATION_ERROR;
150 delete impl;
151 return NULL;
152 }
153 return allModes;
154 }
155
156 Norm2AllModes *
createNFCInstance(UErrorCode & errorCode)157 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
158 if(U_FAILURE(errorCode)) {
159 return NULL;
160 }
161 Normalizer2Impl *impl=new Normalizer2Impl;
162 if(impl==NULL) {
163 errorCode=U_MEMORY_ALLOCATION_ERROR;
164 return NULL;
165 }
166 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
167 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
168 return createInstance(impl, errorCode);
169 }
170
171 U_CDECL_BEGIN
172 static UBool U_CALLCONV uprv_normalizer2_cleanup();
173 U_CDECL_END
174
175 static Norm2AllModes *nfcSingleton;
176 static Normalizer2 *noopSingleton;
177
178 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
179 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
180
181 // UInitOnce singleton initialization functions
initNFCSingleton(UErrorCode & errorCode)182 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
183 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
184 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
185 }
186
initNoopSingleton(UErrorCode & errorCode)187 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
188 if(U_FAILURE(errorCode)) {
189 return;
190 }
191 noopSingleton=new NoopNormalizer2;
192 if(noopSingleton==NULL) {
193 errorCode=U_MEMORY_ALLOCATION_ERROR;
194 return;
195 }
196 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
197 }
198
199 U_CDECL_BEGIN
200
uprv_normalizer2_cleanup()201 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
202 delete nfcSingleton;
203 nfcSingleton = NULL;
204 delete noopSingleton;
205 noopSingleton = NULL;
206 nfcInitOnce.reset();
207 noopInitOnce.reset();
208 return TRUE;
209 }
210
211 U_CDECL_END
212
213 const Norm2AllModes *
getNFCInstance(UErrorCode & errorCode)214 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
215 if(U_FAILURE(errorCode)) { return NULL; }
216 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
217 return nfcSingleton;
218 }
219
220 const Normalizer2 *
getNFCInstance(UErrorCode & errorCode)221 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
222 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
223 return allModes!=NULL ? &allModes->comp : NULL;
224 }
225
226 const Normalizer2 *
getNFDInstance(UErrorCode & errorCode)227 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
228 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
229 return allModes!=NULL ? &allModes->decomp : NULL;
230 }
231
getFCDInstance(UErrorCode & errorCode)232 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
233 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
234 return allModes!=NULL ? &allModes->fcd : NULL;
235 }
236
getFCCInstance(UErrorCode & errorCode)237 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
238 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
239 return allModes!=NULL ? &allModes->fcc : NULL;
240 }
241
getNoopInstance(UErrorCode & errorCode)242 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
243 if(U_FAILURE(errorCode)) { return NULL; }
244 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
245 return noopSingleton;
246 }
247
248 const Normalizer2Impl *
getNFCImpl(UErrorCode & errorCode)249 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
250 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
251 return allModes!=NULL ? allModes->impl : NULL;
252 }
253
254 const Normalizer2Impl *
getImpl(const Normalizer2 * norm2)255 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
256 return &((Normalizer2WithImpl *)norm2)->impl;
257 }
258
259 U_NAMESPACE_END
260
261 // C API ------------------------------------------------------------------- ***
262
263 U_NAMESPACE_USE
264
265 U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFCInstance(UErrorCode * pErrorCode)266 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
267 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
268 }
269
270 U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode * pErrorCode)271 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
272 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
273 }
274
275 U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 * norm2)276 unorm2_close(UNormalizer2 *norm2) {
277 delete (Normalizer2 *)norm2;
278 }
279
280 U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 * norm2,const UChar * src,int32_t length,UChar * dest,int32_t capacity,UErrorCode * pErrorCode)281 unorm2_normalize(const UNormalizer2 *norm2,
282 const UChar *src, int32_t length,
283 UChar *dest, int32_t capacity,
284 UErrorCode *pErrorCode) {
285 if(U_FAILURE(*pErrorCode)) {
286 return 0;
287 }
288 if( (src==NULL ? length!=0 : length<-1) ||
289 (dest==NULL ? capacity!=0 : capacity<0) ||
290 (src==dest && src!=NULL)
291 ) {
292 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
293 return 0;
294 }
295 UnicodeString destString(dest, 0, capacity);
296 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
297 if(length!=0) {
298 const Normalizer2 *n2=(const Normalizer2 *)norm2;
299 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
300 if(n2wi!=NULL) {
301 // Avoid duplicate argument checking and support NUL-terminated src.
302 ReorderingBuffer buffer(n2wi->impl, destString);
303 if(buffer.init(length, *pErrorCode)) {
304 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
305 }
306 } else {
307 UnicodeString srcString(length<0, src, length);
308 n2->normalize(srcString, destString, *pErrorCode);
309 }
310 }
311 return destString.extract(dest, capacity, *pErrorCode);
312 }
313
314 static int32_t
normalizeSecondAndAppend(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UBool doNormalize,UErrorCode * pErrorCode)315 normalizeSecondAndAppend(const UNormalizer2 *norm2,
316 UChar *first, int32_t firstLength, int32_t firstCapacity,
317 const UChar *second, int32_t secondLength,
318 UBool doNormalize,
319 UErrorCode *pErrorCode) {
320 if(U_FAILURE(*pErrorCode)) {
321 return 0;
322 }
323 if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
324 (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
325 (firstCapacity<0 || firstLength<-1)) ||
326 (first==second && first!=NULL)
327 ) {
328 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
329 return 0;
330 }
331 UnicodeString firstString(first, firstLength, firstCapacity);
332 firstLength=firstString.length(); // In case it was -1.
333 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
334 if(secondLength!=0) {
335 const Normalizer2 *n2=(const Normalizer2 *)norm2;
336 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
337 if(n2wi!=NULL) {
338 // Avoid duplicate argument checking and support NUL-terminated src.
339 UnicodeString safeMiddle;
340 {
341 ReorderingBuffer buffer(n2wi->impl, firstString);
342 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
343 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
344 doNormalize, safeMiddle, buffer, *pErrorCode);
345 }
346 } // The ReorderingBuffer destructor finalizes firstString.
347 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
348 // Restore the modified suffix of the first string.
349 // This does not restore first[] array contents between firstLength and firstCapacity.
350 // (That might be uninitialized memory, as far as we know.)
351 if(first!=NULL) { /* don't dereference NULL */
352 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
353 if(firstLength<firstCapacity) {
354 first[firstLength]=0; // NUL-terminate in case it was originally.
355 }
356 }
357 }
358 } else {
359 UnicodeString secondString(secondLength<0, second, secondLength);
360 if(doNormalize) {
361 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
362 } else {
363 n2->append(firstString, secondString, *pErrorCode);
364 }
365 }
366 }
367 return firstString.extract(first, firstCapacity, *pErrorCode);
368 }
369
370 U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UErrorCode * pErrorCode)371 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
372 UChar *first, int32_t firstLength, int32_t firstCapacity,
373 const UChar *second, int32_t secondLength,
374 UErrorCode *pErrorCode) {
375 return normalizeSecondAndAppend(norm2,
376 first, firstLength, firstCapacity,
377 second, secondLength,
378 TRUE, pErrorCode);
379 }
380
381 U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UErrorCode * pErrorCode)382 unorm2_append(const UNormalizer2 *norm2,
383 UChar *first, int32_t firstLength, int32_t firstCapacity,
384 const UChar *second, int32_t secondLength,
385 UErrorCode *pErrorCode) {
386 return normalizeSecondAndAppend(norm2,
387 first, firstLength, firstCapacity,
388 second, secondLength,
389 FALSE, pErrorCode);
390 }
391
392 U_CAPI int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 * norm2,UChar32 c,UChar * decomposition,int32_t capacity,UErrorCode * pErrorCode)393 unorm2_getDecomposition(const UNormalizer2 *norm2,
394 UChar32 c, UChar *decomposition, int32_t capacity,
395 UErrorCode *pErrorCode) {
396 if(U_FAILURE(*pErrorCode)) {
397 return 0;
398 }
399 if(decomposition==NULL ? capacity!=0 : capacity<0) {
400 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
401 return 0;
402 }
403 UnicodeString destString(decomposition, 0, capacity);
404 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
405 return destString.extract(decomposition, capacity, *pErrorCode);
406 } else {
407 return -1;
408 }
409 }
410
411 U_CAPI int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 * norm2,UChar32 c,UChar * decomposition,int32_t capacity,UErrorCode * pErrorCode)412 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
413 UChar32 c, UChar *decomposition, int32_t capacity,
414 UErrorCode *pErrorCode) {
415 if(U_FAILURE(*pErrorCode)) {
416 return 0;
417 }
418 if(decomposition==NULL ? capacity!=0 : capacity<0) {
419 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
420 return 0;
421 }
422 UnicodeString destString(decomposition, 0, capacity);
423 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
424 return destString.extract(decomposition, capacity, *pErrorCode);
425 } else {
426 return -1;
427 }
428 }
429
430 U_CAPI UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 * norm2,UChar32 a,UChar32 b)431 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
432 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
433 }
434
435 U_CAPI uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 * norm2,UChar32 c)436 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
437 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
438 }
439
440 U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)441 unorm2_isNormalized(const UNormalizer2 *norm2,
442 const UChar *s, int32_t length,
443 UErrorCode *pErrorCode) {
444 if(U_FAILURE(*pErrorCode)) {
445 return 0;
446 }
447 if((s==NULL && length!=0) || length<-1) {
448 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
449 return 0;
450 }
451 UnicodeString sString(length<0, s, length);
452 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
453 }
454
455 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)456 unorm2_quickCheck(const UNormalizer2 *norm2,
457 const UChar *s, int32_t length,
458 UErrorCode *pErrorCode) {
459 if(U_FAILURE(*pErrorCode)) {
460 return UNORM_NO;
461 }
462 if((s==NULL && length!=0) || length<-1) {
463 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
464 return UNORM_NO;
465 }
466 UnicodeString sString(length<0, s, length);
467 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
468 }
469
470 U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)471 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
472 const UChar *s, int32_t length,
473 UErrorCode *pErrorCode) {
474 if(U_FAILURE(*pErrorCode)) {
475 return 0;
476 }
477 if((s==NULL && length!=0) || length<-1) {
478 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
479 return 0;
480 }
481 UnicodeString sString(length<0, s, length);
482 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
483 }
484
485 U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 * norm2,UChar32 c)486 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
487 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
488 }
489
490 U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 * norm2,UChar32 c)491 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
492 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
493 }
494
495 U_CAPI UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 * norm2,UChar32 c)496 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
497 return ((const Normalizer2 *)norm2)->isInert(c);
498 }
499
500 // Some properties APIs ---------------------------------------------------- ***
501
502 U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c)503 u_getCombiningClass(UChar32 c) {
504 UErrorCode errorCode=U_ZERO_ERROR;
505 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
506 if(U_SUCCESS(errorCode)) {
507 return nfd->getCombiningClass(c);
508 } else {
509 return 0;
510 }
511 }
512
513 U_CFUNC uint16_t
unorm_getFCD16(UChar32 c)514 unorm_getFCD16(UChar32 c) {
515 UErrorCode errorCode=U_ZERO_ERROR;
516 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
517 if(U_SUCCESS(errorCode)) {
518 return impl->getFCD16(c);
519 } else {
520 return 0;
521 }
522 }
523
524 #endif // !UCONFIG_NO_NORMALIZATION
525