1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2009-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: normalizer2.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2009nov22
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18
19 #if !UCONFIG_NO_NORMALIZATION
20
21 #include "unicode/normalizer2.h"
22 #include "unicode/unistr.h"
23 #include "unicode/unorm.h"
24 #include "cstring.h"
25 #include "mutex.h"
26 #include "norm2allmodes.h"
27 #include "normalizer2impl.h"
28 #include "uassert.h"
29 #include "ucln_cmn.h"
30
31 using icu::Normalizer2Impl;
32
33 // NFC/NFD data machine-generated by gennorm2 --csource
34 #include "norm2_nfc_data.h"
35
36 U_NAMESPACE_BEGIN
37
38 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
39
~Normalizer2()40 Normalizer2::~Normalizer2() {}
41
42 UBool
getRawDecomposition(UChar32,UnicodeString &) const43 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
44 return FALSE;
45 }
46
47 UChar32
composePair(UChar32,UChar32) const48 Normalizer2::composePair(UChar32, UChar32) const {
49 return U_SENTINEL;
50 }
51
52 uint8_t
getCombiningClass(UChar32) const53 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
54 return 0;
55 }
56
57 // Normalizer2 implementation for the old UNORM_NONE.
58 class NoopNormalizer2 : public Normalizer2 {
59 virtual ~NoopNormalizer2();
60
61 virtual UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const62 normalize(const UnicodeString &src,
63 UnicodeString &dest,
64 UErrorCode &errorCode) const {
65 if(U_SUCCESS(errorCode)) {
66 if(&dest!=&src) {
67 dest=src;
68 } else {
69 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
70 }
71 }
72 return dest;
73 }
74 virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const75 normalizeSecondAndAppend(UnicodeString &first,
76 const UnicodeString &second,
77 UErrorCode &errorCode) const {
78 if(U_SUCCESS(errorCode)) {
79 if(&first!=&second) {
80 first.append(second);
81 } else {
82 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
83 }
84 }
85 return first;
86 }
87 virtual UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const88 append(UnicodeString &first,
89 const UnicodeString &second,
90 UErrorCode &errorCode) const {
91 if(U_SUCCESS(errorCode)) {
92 if(&first!=&second) {
93 first.append(second);
94 } else {
95 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
96 }
97 }
98 return first;
99 }
100 virtual UBool
getDecomposition(UChar32,UnicodeString &) const101 getDecomposition(UChar32, UnicodeString &) const {
102 return FALSE;
103 }
104 // No need to override the default getRawDecomposition().
105 virtual UBool
isNormalized(const UnicodeString &,UErrorCode &) const106 isNormalized(const UnicodeString &, UErrorCode &) const {
107 return TRUE;
108 }
109 virtual UNormalizationCheckResult
quickCheck(const UnicodeString &,UErrorCode &) const110 quickCheck(const UnicodeString &, UErrorCode &) const {
111 return UNORM_YES;
112 }
113 virtual int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode &) const114 spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
115 return s.length();
116 }
hasBoundaryBefore(UChar32) const117 virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
hasBoundaryAfter(UChar32) const118 virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
isInert(UChar32) const119 virtual UBool isInert(UChar32) const { return TRUE; }
120 };
121
~NoopNormalizer2()122 NoopNormalizer2::~NoopNormalizer2() {}
123
~Normalizer2WithImpl()124 Normalizer2WithImpl::~Normalizer2WithImpl() {}
125
~DecomposeNormalizer2()126 DecomposeNormalizer2::~DecomposeNormalizer2() {}
127
~ComposeNormalizer2()128 ComposeNormalizer2::~ComposeNormalizer2() {}
129
~FCDNormalizer2()130 FCDNormalizer2::~FCDNormalizer2() {}
131
132 // instance cache ---------------------------------------------------------- ***
133
~Norm2AllModes()134 Norm2AllModes::~Norm2AllModes() {
135 delete impl;
136 }
137
138 Norm2AllModes *
createInstance(Normalizer2Impl * impl,UErrorCode & errorCode)139 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
140 if(U_FAILURE(errorCode)) {
141 delete impl;
142 return NULL;
143 }
144 Norm2AllModes *allModes=new Norm2AllModes(impl);
145 if(allModes==NULL) {
146 errorCode=U_MEMORY_ALLOCATION_ERROR;
147 delete impl;
148 return NULL;
149 }
150 return allModes;
151 }
152
153 Norm2AllModes *
createNFCInstance(UErrorCode & errorCode)154 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
155 if(U_FAILURE(errorCode)) {
156 return NULL;
157 }
158 Normalizer2Impl *impl=new Normalizer2Impl;
159 if(impl==NULL) {
160 errorCode=U_MEMORY_ALLOCATION_ERROR;
161 return NULL;
162 }
163 impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
164 norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
165 return createInstance(impl, errorCode);
166 }
167
168 U_CDECL_BEGIN
169 static UBool U_CALLCONV uprv_normalizer2_cleanup();
170 U_CDECL_END
171
172 static Norm2AllModes *nfcSingleton;
173 static Normalizer2 *noopSingleton;
174
175 static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
176 static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
177
178 // UInitOnce singleton initialization functions
initNFCSingleton(UErrorCode & errorCode)179 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
180 nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
181 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
182 }
183
initNoopSingleton(UErrorCode & errorCode)184 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
185 if(U_FAILURE(errorCode)) {
186 return;
187 }
188 noopSingleton=new NoopNormalizer2;
189 if(noopSingleton==NULL) {
190 errorCode=U_MEMORY_ALLOCATION_ERROR;
191 return;
192 }
193 ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
194 }
195
196 U_CDECL_BEGIN
197
uprv_normalizer2_cleanup()198 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
199 delete nfcSingleton;
200 nfcSingleton = NULL;
201 delete noopSingleton;
202 noopSingleton = NULL;
203 nfcInitOnce.reset();
204 noopInitOnce.reset();
205 return TRUE;
206 }
207
208 U_CDECL_END
209
210 const Norm2AllModes *
getNFCInstance(UErrorCode & errorCode)211 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
212 if(U_FAILURE(errorCode)) { return NULL; }
213 umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
214 return nfcSingleton;
215 }
216
217 const Normalizer2 *
getNFCInstance(UErrorCode & errorCode)218 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
219 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
220 return allModes!=NULL ? &allModes->comp : NULL;
221 }
222
223 const Normalizer2 *
getNFDInstance(UErrorCode & errorCode)224 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
225 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
226 return allModes!=NULL ? &allModes->decomp : NULL;
227 }
228
getFCDInstance(UErrorCode & errorCode)229 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
230 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
231 return allModes!=NULL ? &allModes->fcd : NULL;
232 }
233
getFCCInstance(UErrorCode & errorCode)234 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
235 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
236 return allModes!=NULL ? &allModes->fcc : NULL;
237 }
238
getNoopInstance(UErrorCode & errorCode)239 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
240 if(U_FAILURE(errorCode)) { return NULL; }
241 umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
242 return noopSingleton;
243 }
244
245 const Normalizer2Impl *
getNFCImpl(UErrorCode & errorCode)246 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
247 const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
248 return allModes!=NULL ? allModes->impl : NULL;
249 }
250
251 const Normalizer2Impl *
getImpl(const Normalizer2 * norm2)252 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
253 return &((Normalizer2WithImpl *)norm2)->impl;
254 }
255
256 U_NAMESPACE_END
257
258 // C API ------------------------------------------------------------------- ***
259
260 U_NAMESPACE_USE
261
262 U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFCInstance(UErrorCode * pErrorCode)263 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
264 return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
265 }
266
267 U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode * pErrorCode)268 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
269 return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
270 }
271
272 U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 * norm2)273 unorm2_close(UNormalizer2 *norm2) {
274 delete (Normalizer2 *)norm2;
275 }
276
277 U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 * norm2,const UChar * src,int32_t length,UChar * dest,int32_t capacity,UErrorCode * pErrorCode)278 unorm2_normalize(const UNormalizer2 *norm2,
279 const UChar *src, int32_t length,
280 UChar *dest, int32_t capacity,
281 UErrorCode *pErrorCode) {
282 if(U_FAILURE(*pErrorCode)) {
283 return 0;
284 }
285 if( (src==NULL ? length!=0 : length<-1) ||
286 (dest==NULL ? capacity!=0 : capacity<0) ||
287 (src==dest && src!=NULL)
288 ) {
289 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
290 return 0;
291 }
292 UnicodeString destString(dest, 0, capacity);
293 // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
294 if(length!=0) {
295 const Normalizer2 *n2=(const Normalizer2 *)norm2;
296 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
297 if(n2wi!=NULL) {
298 // Avoid duplicate argument checking and support NUL-terminated src.
299 ReorderingBuffer buffer(n2wi->impl, destString);
300 if(buffer.init(length, *pErrorCode)) {
301 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
302 }
303 } else {
304 UnicodeString srcString(length<0, src, length);
305 n2->normalize(srcString, destString, *pErrorCode);
306 }
307 }
308 return destString.extract(dest, capacity, *pErrorCode);
309 }
310
311 static int32_t
normalizeSecondAndAppend(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UBool doNormalize,UErrorCode * pErrorCode)312 normalizeSecondAndAppend(const UNormalizer2 *norm2,
313 UChar *first, int32_t firstLength, int32_t firstCapacity,
314 const UChar *second, int32_t secondLength,
315 UBool doNormalize,
316 UErrorCode *pErrorCode) {
317 if(U_FAILURE(*pErrorCode)) {
318 return 0;
319 }
320 if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
321 (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
322 (firstCapacity<0 || firstLength<-1)) ||
323 (first==second && first!=NULL)
324 ) {
325 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
326 return 0;
327 }
328 UnicodeString firstString(first, firstLength, firstCapacity);
329 firstLength=firstString.length(); // In case it was -1.
330 // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
331 if(secondLength!=0) {
332 const Normalizer2 *n2=(const Normalizer2 *)norm2;
333 const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
334 if(n2wi!=NULL) {
335 // Avoid duplicate argument checking and support NUL-terminated src.
336 UnicodeString safeMiddle;
337 {
338 ReorderingBuffer buffer(n2wi->impl, firstString);
339 if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
340 n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
341 doNormalize, safeMiddle, buffer, *pErrorCode);
342 }
343 } // The ReorderingBuffer destructor finalizes firstString.
344 if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
345 // Restore the modified suffix of the first string.
346 // This does not restore first[] array contents between firstLength and firstCapacity.
347 // (That might be uninitialized memory, as far as we know.)
348 if(first!=NULL) { /* don't dereference NULL */
349 safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
350 if(firstLength<firstCapacity) {
351 first[firstLength]=0; // NUL-terminate in case it was originally.
352 }
353 }
354 }
355 } else {
356 UnicodeString secondString(secondLength<0, second, secondLength);
357 if(doNormalize) {
358 n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
359 } else {
360 n2->append(firstString, secondString, *pErrorCode);
361 }
362 }
363 }
364 return firstString.extract(first, firstCapacity, *pErrorCode);
365 }
366
367 U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UErrorCode * pErrorCode)368 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
369 UChar *first, int32_t firstLength, int32_t firstCapacity,
370 const UChar *second, int32_t secondLength,
371 UErrorCode *pErrorCode) {
372 return normalizeSecondAndAppend(norm2,
373 first, firstLength, firstCapacity,
374 second, secondLength,
375 TRUE, pErrorCode);
376 }
377
378 U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UErrorCode * pErrorCode)379 unorm2_append(const UNormalizer2 *norm2,
380 UChar *first, int32_t firstLength, int32_t firstCapacity,
381 const UChar *second, int32_t secondLength,
382 UErrorCode *pErrorCode) {
383 return normalizeSecondAndAppend(norm2,
384 first, firstLength, firstCapacity,
385 second, secondLength,
386 FALSE, pErrorCode);
387 }
388
389 U_CAPI int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 * norm2,UChar32 c,UChar * decomposition,int32_t capacity,UErrorCode * pErrorCode)390 unorm2_getDecomposition(const UNormalizer2 *norm2,
391 UChar32 c, UChar *decomposition, int32_t capacity,
392 UErrorCode *pErrorCode) {
393 if(U_FAILURE(*pErrorCode)) {
394 return 0;
395 }
396 if(decomposition==NULL ? capacity!=0 : capacity<0) {
397 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
398 return 0;
399 }
400 UnicodeString destString(decomposition, 0, capacity);
401 if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
402 return destString.extract(decomposition, capacity, *pErrorCode);
403 } else {
404 return -1;
405 }
406 }
407
408 U_CAPI int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 * norm2,UChar32 c,UChar * decomposition,int32_t capacity,UErrorCode * pErrorCode)409 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
410 UChar32 c, UChar *decomposition, int32_t capacity,
411 UErrorCode *pErrorCode) {
412 if(U_FAILURE(*pErrorCode)) {
413 return 0;
414 }
415 if(decomposition==NULL ? capacity!=0 : capacity<0) {
416 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
417 return 0;
418 }
419 UnicodeString destString(decomposition, 0, capacity);
420 if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
421 return destString.extract(decomposition, capacity, *pErrorCode);
422 } else {
423 return -1;
424 }
425 }
426
427 U_CAPI UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 * norm2,UChar32 a,UChar32 b)428 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
429 return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
430 }
431
432 U_CAPI uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 * norm2,UChar32 c)433 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
434 return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
435 }
436
437 U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)438 unorm2_isNormalized(const UNormalizer2 *norm2,
439 const UChar *s, int32_t length,
440 UErrorCode *pErrorCode) {
441 if(U_FAILURE(*pErrorCode)) {
442 return 0;
443 }
444 if((s==NULL && length!=0) || length<-1) {
445 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
446 return 0;
447 }
448 UnicodeString sString(length<0, s, length);
449 return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
450 }
451
452 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)453 unorm2_quickCheck(const UNormalizer2 *norm2,
454 const UChar *s, int32_t length,
455 UErrorCode *pErrorCode) {
456 if(U_FAILURE(*pErrorCode)) {
457 return UNORM_NO;
458 }
459 if((s==NULL && length!=0) || length<-1) {
460 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
461 return UNORM_NO;
462 }
463 UnicodeString sString(length<0, s, length);
464 return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
465 }
466
467 U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)468 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
469 const UChar *s, int32_t length,
470 UErrorCode *pErrorCode) {
471 if(U_FAILURE(*pErrorCode)) {
472 return 0;
473 }
474 if((s==NULL && length!=0) || length<-1) {
475 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
476 return 0;
477 }
478 UnicodeString sString(length<0, s, length);
479 return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
480 }
481
482 U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 * norm2,UChar32 c)483 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
484 return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
485 }
486
487 U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 * norm2,UChar32 c)488 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
489 return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
490 }
491
492 U_CAPI UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 * norm2,UChar32 c)493 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
494 return ((const Normalizer2 *)norm2)->isInert(c);
495 }
496
497 // Some properties APIs ---------------------------------------------------- ***
498
499 U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c)500 u_getCombiningClass(UChar32 c) {
501 UErrorCode errorCode=U_ZERO_ERROR;
502 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
503 if(U_SUCCESS(errorCode)) {
504 return nfd->getCombiningClass(c);
505 } else {
506 return 0;
507 }
508 }
509
510 U_CFUNC uint16_t
unorm_getFCD16(UChar32 c)511 unorm_getFCD16(UChar32 c) {
512 UErrorCode errorCode=U_ZERO_ERROR;
513 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
514 if(U_SUCCESS(errorCode)) {
515 return impl->getFCD16(c);
516 } else {
517 return 0;
518 }
519 }
520
521 #endif // !UCONFIG_NO_NORMALIZATION
522