1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  normalizer2.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009nov22
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 
19 #if !UCONFIG_NO_NORMALIZATION
20 
21 #include "unicode/normalizer2.h"
22 #include "unicode/unistr.h"
23 #include "unicode/unorm.h"
24 #include "cstring.h"
25 #include "mutex.h"
26 #include "norm2allmodes.h"
27 #include "normalizer2impl.h"
28 #include "uassert.h"
29 #include "ucln_cmn.h"
30 
31 using icu::Normalizer2Impl;
32 
33 // NFC/NFD data machine-generated by gennorm2 --csource
34 #include "norm2_nfc_data.h"
35 
36 U_NAMESPACE_BEGIN
37 
38 // Public API dispatch via Normalizer2 subclasses -------------------------- ***
39 
~Normalizer2()40 Normalizer2::~Normalizer2() {}
41 
42 UBool
getRawDecomposition(UChar32,UnicodeString &) const43 Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
44     return FALSE;
45 }
46 
47 UChar32
composePair(UChar32,UChar32) const48 Normalizer2::composePair(UChar32, UChar32) const {
49     return U_SENTINEL;
50 }
51 
52 uint8_t
getCombiningClass(UChar32) const53 Normalizer2::getCombiningClass(UChar32 /*c*/) const {
54     return 0;
55 }
56 
57 // Normalizer2 implementation for the old UNORM_NONE.
58 class NoopNormalizer2 : public Normalizer2 {
59     virtual ~NoopNormalizer2();
60 
61     virtual UnicodeString &
normalize(const UnicodeString & src,UnicodeString & dest,UErrorCode & errorCode) const62     normalize(const UnicodeString &src,
63               UnicodeString &dest,
64               UErrorCode &errorCode) const {
65         if(U_SUCCESS(errorCode)) {
66             if(&dest!=&src) {
67                 dest=src;
68             } else {
69                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
70             }
71         }
72         return dest;
73     }
74     virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const75     normalizeSecondAndAppend(UnicodeString &first,
76                              const UnicodeString &second,
77                              UErrorCode &errorCode) const {
78         if(U_SUCCESS(errorCode)) {
79             if(&first!=&second) {
80                 first.append(second);
81             } else {
82                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
83             }
84         }
85         return first;
86     }
87     virtual UnicodeString &
append(UnicodeString & first,const UnicodeString & second,UErrorCode & errorCode) const88     append(UnicodeString &first,
89            const UnicodeString &second,
90            UErrorCode &errorCode) const {
91         if(U_SUCCESS(errorCode)) {
92             if(&first!=&second) {
93                 first.append(second);
94             } else {
95                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
96             }
97         }
98         return first;
99     }
100     virtual UBool
getDecomposition(UChar32,UnicodeString &) const101     getDecomposition(UChar32, UnicodeString &) const {
102         return FALSE;
103     }
104     // No need to override the default getRawDecomposition().
105     virtual UBool
isNormalized(const UnicodeString &,UErrorCode &) const106     isNormalized(const UnicodeString &, UErrorCode &) const {
107         return TRUE;
108     }
109     virtual UNormalizationCheckResult
quickCheck(const UnicodeString &,UErrorCode &) const110     quickCheck(const UnicodeString &, UErrorCode &) const {
111         return UNORM_YES;
112     }
113     virtual int32_t
spanQuickCheckYes(const UnicodeString & s,UErrorCode &) const114     spanQuickCheckYes(const UnicodeString &s, UErrorCode &) const {
115         return s.length();
116     }
hasBoundaryBefore(UChar32) const117     virtual UBool hasBoundaryBefore(UChar32) const { return TRUE; }
hasBoundaryAfter(UChar32) const118     virtual UBool hasBoundaryAfter(UChar32) const { return TRUE; }
isInert(UChar32) const119     virtual UBool isInert(UChar32) const { return TRUE; }
120 };
121 
~NoopNormalizer2()122 NoopNormalizer2::~NoopNormalizer2() {}
123 
~Normalizer2WithImpl()124 Normalizer2WithImpl::~Normalizer2WithImpl() {}
125 
~DecomposeNormalizer2()126 DecomposeNormalizer2::~DecomposeNormalizer2() {}
127 
~ComposeNormalizer2()128 ComposeNormalizer2::~ComposeNormalizer2() {}
129 
~FCDNormalizer2()130 FCDNormalizer2::~FCDNormalizer2() {}
131 
132 // instance cache ---------------------------------------------------------- ***
133 
~Norm2AllModes()134 Norm2AllModes::~Norm2AllModes() {
135     delete impl;
136 }
137 
138 Norm2AllModes *
createInstance(Normalizer2Impl * impl,UErrorCode & errorCode)139 Norm2AllModes::createInstance(Normalizer2Impl *impl, UErrorCode &errorCode) {
140     if(U_FAILURE(errorCode)) {
141         delete impl;
142         return NULL;
143     }
144     Norm2AllModes *allModes=new Norm2AllModes(impl);
145     if(allModes==NULL) {
146         errorCode=U_MEMORY_ALLOCATION_ERROR;
147         delete impl;
148         return NULL;
149     }
150     return allModes;
151 }
152 
153 Norm2AllModes *
createNFCInstance(UErrorCode & errorCode)154 Norm2AllModes::createNFCInstance(UErrorCode &errorCode) {
155     if(U_FAILURE(errorCode)) {
156         return NULL;
157     }
158     Normalizer2Impl *impl=new Normalizer2Impl;
159     if(impl==NULL) {
160         errorCode=U_MEMORY_ALLOCATION_ERROR;
161         return NULL;
162     }
163     impl->init(norm2_nfc_data_indexes, &norm2_nfc_data_trie,
164                norm2_nfc_data_extraData, norm2_nfc_data_smallFCD);
165     return createInstance(impl, errorCode);
166 }
167 
// Forward declaration of the cleanup callback so that the singleton
// initialization functions can register it before its definition.
U_CDECL_BEGIN
static UBool U_CALLCONV uprv_normalizer2_cleanup();
U_CDECL_END

// Lazily created singletons; both are deleted and reset to NULL
// by uprv_normalizer2_cleanup().
static Norm2AllModes *nfcSingleton;
static Normalizer2   *noopSingleton;

// One-time initialization state for each singleton, used with umtx_initOnce()
// and reset by uprv_normalizer2_cleanup().
static icu::UInitOnce nfcInitOnce = U_INITONCE_INITIALIZER;
static icu::UInitOnce noopInitOnce = U_INITONCE_INITIALIZER;
177 
178 // UInitOnce singleton initialization functions
initNFCSingleton(UErrorCode & errorCode)179 static void U_CALLCONV initNFCSingleton(UErrorCode &errorCode) {
180     nfcSingleton=Norm2AllModes::createNFCInstance(errorCode);
181     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
182 }
183 
initNoopSingleton(UErrorCode & errorCode)184 static void U_CALLCONV initNoopSingleton(UErrorCode &errorCode) {
185     if(U_FAILURE(errorCode)) {
186         return;
187     }
188     noopSingleton=new NoopNormalizer2;
189     if(noopSingleton==NULL) {
190         errorCode=U_MEMORY_ALLOCATION_ERROR;
191         return;
192     }
193     ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
194 }
195 
196 U_CDECL_BEGIN
197 
uprv_normalizer2_cleanup()198 static UBool U_CALLCONV uprv_normalizer2_cleanup() {
199     delete nfcSingleton;
200     nfcSingleton = NULL;
201     delete noopSingleton;
202     noopSingleton = NULL;
203     nfcInitOnce.reset();
204     noopInitOnce.reset();
205     return TRUE;
206 }
207 
208 U_CDECL_END
209 
210 const Norm2AllModes *
getNFCInstance(UErrorCode & errorCode)211 Norm2AllModes::getNFCInstance(UErrorCode &errorCode) {
212     if(U_FAILURE(errorCode)) { return NULL; }
213     umtx_initOnce(nfcInitOnce, &initNFCSingleton, errorCode);
214     return nfcSingleton;
215 }
216 
217 const Normalizer2 *
getNFCInstance(UErrorCode & errorCode)218 Normalizer2::getNFCInstance(UErrorCode &errorCode) {
219     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
220     return allModes!=NULL ? &allModes->comp : NULL;
221 }
222 
223 const Normalizer2 *
getNFDInstance(UErrorCode & errorCode)224 Normalizer2::getNFDInstance(UErrorCode &errorCode) {
225     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
226     return allModes!=NULL ? &allModes->decomp : NULL;
227 }
228 
getFCDInstance(UErrorCode & errorCode)229 const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
230     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
231     return allModes!=NULL ? &allModes->fcd : NULL;
232 }
233 
getFCCInstance(UErrorCode & errorCode)234 const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
235     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
236     return allModes!=NULL ? &allModes->fcc : NULL;
237 }
238 
getNoopInstance(UErrorCode & errorCode)239 const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
240     if(U_FAILURE(errorCode)) { return NULL; }
241     umtx_initOnce(noopInitOnce, &initNoopSingleton, errorCode);
242     return noopSingleton;
243 }
244 
245 const Normalizer2Impl *
getNFCImpl(UErrorCode & errorCode)246 Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
247     const Norm2AllModes *allModes=Norm2AllModes::getNFCInstance(errorCode);
248     return allModes!=NULL ? allModes->impl : NULL;
249 }
250 
251 const Normalizer2Impl *
getImpl(const Normalizer2 * norm2)252 Normalizer2Factory::getImpl(const Normalizer2 *norm2) {
253     return &((Normalizer2WithImpl *)norm2)->impl;
254 }
255 
256 U_NAMESPACE_END
257 
258 // C API ------------------------------------------------------------------- ***
259 
260 U_NAMESPACE_USE
261 
262 U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFCInstance(UErrorCode * pErrorCode)263 unorm2_getNFCInstance(UErrorCode *pErrorCode) {
264     return (const UNormalizer2 *)Normalizer2::getNFCInstance(*pErrorCode);
265 }
266 
267 U_CAPI const UNormalizer2 * U_EXPORT2
unorm2_getNFDInstance(UErrorCode * pErrorCode)268 unorm2_getNFDInstance(UErrorCode *pErrorCode) {
269     return (const UNormalizer2 *)Normalizer2::getNFDInstance(*pErrorCode);
270 }
271 
272 U_CAPI void U_EXPORT2
unorm2_close(UNormalizer2 * norm2)273 unorm2_close(UNormalizer2 *norm2) {
274     delete (Normalizer2 *)norm2;
275 }
276 
277 U_CAPI int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 * norm2,const UChar * src,int32_t length,UChar * dest,int32_t capacity,UErrorCode * pErrorCode)278 unorm2_normalize(const UNormalizer2 *norm2,
279                  const UChar *src, int32_t length,
280                  UChar *dest, int32_t capacity,
281                  UErrorCode *pErrorCode) {
282     if(U_FAILURE(*pErrorCode)) {
283         return 0;
284     }
285     if( (src==NULL ? length!=0 : length<-1) ||
286         (dest==NULL ? capacity!=0 : capacity<0) ||
287         (src==dest && src!=NULL)
288     ) {
289         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
290         return 0;
291     }
292     UnicodeString destString(dest, 0, capacity);
293     // length==0: Nothing to do, and n2wi->normalize(NULL, NULL, buffer, ...) would crash.
294     if(length!=0) {
295         const Normalizer2 *n2=(const Normalizer2 *)norm2;
296         const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
297         if(n2wi!=NULL) {
298             // Avoid duplicate argument checking and support NUL-terminated src.
299             ReorderingBuffer buffer(n2wi->impl, destString);
300             if(buffer.init(length, *pErrorCode)) {
301                 n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
302             }
303         } else {
304             UnicodeString srcString(length<0, src, length);
305             n2->normalize(srcString, destString, *pErrorCode);
306         }
307     }
308     return destString.extract(dest, capacity, *pErrorCode);
309 }
310 
// Shared implementation of unorm2_normalizeSecondAndAppend() (doNormalize=TRUE)
// and unorm2_append() (doNormalize=FALSE): appends second[] to the string in first[].
//
// Argument rules enforced here (violations set U_ILLEGAL_ARGUMENT_ERROR and return 0):
// - second==NULL requires secondLength==0; otherwise secondLength must be >=-1.
// - first==NULL requires firstCapacity==0 and firstLength==0;
//   otherwise firstCapacity must be >=0 and firstLength >=-1.
// - first and second must not be the same non-NULL pointer.
// Returns 0 immediately if *pErrorCode is already a failure; otherwise returns
// the result of extracting the combined string into first[].
static int32_t
normalizeSecondAndAppend(const UNormalizer2 *norm2,
                         UChar *first, int32_t firstLength, int32_t firstCapacity,
                         const UChar *second, int32_t secondLength,
                         UBool doNormalize,
                         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
        (first==NULL ? (firstCapacity!=0 || firstLength!=0) :
                       (firstCapacity<0 || firstLength<-1)) ||
        (first==second && first!=NULL)
    ) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // firstString is constructed directly over the caller's first[] array.
    UnicodeString firstString(first, firstLength, firstCapacity);
    firstLength=firstString.length();  // In case it was -1.
    // secondLength==0: Nothing to do, and n2wi->normalizeAndAppend(NULL, NULL, buffer, ...) would crash.
    if(secondLength!=0) {
        const Normalizer2 *n2=(const Normalizer2 *)norm2;
        // Take the direct path when the normalizer is a Normalizer2WithImpl.
        const Normalizer2WithImpl *n2wi=dynamic_cast<const Normalizer2WithImpl *>(n2);
        if(n2wi!=NULL) {
            // Avoid duplicate argument checking and support NUL-terminated src.
            // safeMiddle is filled in by normalizeAndAppend() and used below
            // to restore the end of first[] if the operation fails or overflows.
            UnicodeString safeMiddle;
            {
                ReorderingBuffer buffer(n2wi->impl, firstString);
                if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
                    // A negative secondLength passes a NULL limit.
                    n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
                                             doNormalize, safeMiddle, buffer, *pErrorCode);
                }
            }  // The ReorderingBuffer destructor finalizes firstString.
            if(U_FAILURE(*pErrorCode) || firstString.length()>firstCapacity) {
                // Restore the modified suffix of the first string.
                // This does not restore first[] array contents between firstLength and firstCapacity.
                // (That might be uninitialized memory, as far as we know.)
                if(first!=NULL) { /* don't dereference NULL */
                  // Write safeMiddle back so that it ends at the original firstLength.
                  safeMiddle.extract(0, 0x7fffffff, first+firstLength-safeMiddle.length());
                  if(firstLength<firstCapacity) {
                    first[firstLength]=0;  // NUL-terminate in case it was originally.
                  }
                }
            }
        } else {
            // Generic path through the public UnicodeString API.
            UnicodeString secondString(secondLength<0, second, secondLength);
            if(doNormalize) {
                n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
            } else {
                n2->append(firstString, secondString, *pErrorCode);
            }
        }
    }
    return firstString.extract(first, firstCapacity, *pErrorCode);
}
366 
367 U_CAPI int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UErrorCode * pErrorCode)368 unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
369                                 UChar *first, int32_t firstLength, int32_t firstCapacity,
370                                 const UChar *second, int32_t secondLength,
371                                 UErrorCode *pErrorCode) {
372     return normalizeSecondAndAppend(norm2,
373                                     first, firstLength, firstCapacity,
374                                     second, secondLength,
375                                     TRUE, pErrorCode);
376 }
377 
378 U_CAPI int32_t U_EXPORT2
unorm2_append(const UNormalizer2 * norm2,UChar * first,int32_t firstLength,int32_t firstCapacity,const UChar * second,int32_t secondLength,UErrorCode * pErrorCode)379 unorm2_append(const UNormalizer2 *norm2,
380               UChar *first, int32_t firstLength, int32_t firstCapacity,
381               const UChar *second, int32_t secondLength,
382               UErrorCode *pErrorCode) {
383     return normalizeSecondAndAppend(norm2,
384                                     first, firstLength, firstCapacity,
385                                     second, secondLength,
386                                     FALSE, pErrorCode);
387 }
388 
389 U_CAPI int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 * norm2,UChar32 c,UChar * decomposition,int32_t capacity,UErrorCode * pErrorCode)390 unorm2_getDecomposition(const UNormalizer2 *norm2,
391                         UChar32 c, UChar *decomposition, int32_t capacity,
392                         UErrorCode *pErrorCode) {
393     if(U_FAILURE(*pErrorCode)) {
394         return 0;
395     }
396     if(decomposition==NULL ? capacity!=0 : capacity<0) {
397         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
398         return 0;
399     }
400     UnicodeString destString(decomposition, 0, capacity);
401     if(reinterpret_cast<const Normalizer2 *>(norm2)->getDecomposition(c, destString)) {
402         return destString.extract(decomposition, capacity, *pErrorCode);
403     } else {
404         return -1;
405     }
406 }
407 
408 U_CAPI int32_t U_EXPORT2
unorm2_getRawDecomposition(const UNormalizer2 * norm2,UChar32 c,UChar * decomposition,int32_t capacity,UErrorCode * pErrorCode)409 unorm2_getRawDecomposition(const UNormalizer2 *norm2,
410                            UChar32 c, UChar *decomposition, int32_t capacity,
411                            UErrorCode *pErrorCode) {
412     if(U_FAILURE(*pErrorCode)) {
413         return 0;
414     }
415     if(decomposition==NULL ? capacity!=0 : capacity<0) {
416         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
417         return 0;
418     }
419     UnicodeString destString(decomposition, 0, capacity);
420     if(reinterpret_cast<const Normalizer2 *>(norm2)->getRawDecomposition(c, destString)) {
421         return destString.extract(decomposition, capacity, *pErrorCode);
422     } else {
423         return -1;
424     }
425 }
426 
427 U_CAPI UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 * norm2,UChar32 a,UChar32 b)428 unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
429     return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
430 }
431 
432 U_CAPI uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 * norm2,UChar32 c)433 unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
434     return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
435 }
436 
437 U_CAPI UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)438 unorm2_isNormalized(const UNormalizer2 *norm2,
439                     const UChar *s, int32_t length,
440                     UErrorCode *pErrorCode) {
441     if(U_FAILURE(*pErrorCode)) {
442         return 0;
443     }
444     if((s==NULL && length!=0) || length<-1) {
445         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
446         return 0;
447     }
448     UnicodeString sString(length<0, s, length);
449     return ((const Normalizer2 *)norm2)->isNormalized(sString, *pErrorCode);
450 }
451 
452 U_CAPI UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)453 unorm2_quickCheck(const UNormalizer2 *norm2,
454                   const UChar *s, int32_t length,
455                   UErrorCode *pErrorCode) {
456     if(U_FAILURE(*pErrorCode)) {
457         return UNORM_NO;
458     }
459     if((s==NULL && length!=0) || length<-1) {
460         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
461         return UNORM_NO;
462     }
463     UnicodeString sString(length<0, s, length);
464     return ((const Normalizer2 *)norm2)->quickCheck(sString, *pErrorCode);
465 }
466 
467 U_CAPI int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 * norm2,const UChar * s,int32_t length,UErrorCode * pErrorCode)468 unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
469                          const UChar *s, int32_t length,
470                          UErrorCode *pErrorCode) {
471     if(U_FAILURE(*pErrorCode)) {
472         return 0;
473     }
474     if((s==NULL && length!=0) || length<-1) {
475         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
476         return 0;
477     }
478     UnicodeString sString(length<0, s, length);
479     return ((const Normalizer2 *)norm2)->spanQuickCheckYes(sString, *pErrorCode);
480 }
481 
482 U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 * norm2,UChar32 c)483 unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
484     return ((const Normalizer2 *)norm2)->hasBoundaryBefore(c);
485 }
486 
487 U_CAPI UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 * norm2,UChar32 c)488 unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
489     return ((const Normalizer2 *)norm2)->hasBoundaryAfter(c);
490 }
491 
492 U_CAPI UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 * norm2,UChar32 c)493 unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
494     return ((const Normalizer2 *)norm2)->isInert(c);
495 }
496 
497 // Some properties APIs ---------------------------------------------------- ***
498 
499 U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c)500 u_getCombiningClass(UChar32 c) {
501     UErrorCode errorCode=U_ZERO_ERROR;
502     const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode);
503     if(U_SUCCESS(errorCode)) {
504         return nfd->getCombiningClass(c);
505     } else {
506         return 0;
507     }
508 }
509 
510 U_CFUNC uint16_t
unorm_getFCD16(UChar32 c)511 unorm_getFCD16(UChar32 c) {
512     UErrorCode errorCode=U_ZERO_ERROR;
513     const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
514     if(U_SUCCESS(errorCode)) {
515         return impl->getFCD16(c);
516     } else {
517         return 0;
518     }
519 }
520 
521 #endif  // !UCONFIG_NO_NORMALIZATION
522