1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   07/03/01    aliu        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/normalizer2.h"
16 #include "unicode/utf16.h"
17 #include "cstring.h"
18 #include "nortrans.h"
19 
20 U_NAMESPACE_BEGIN
21 
// Expands the UObject RTTI implementation macro for this class.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
23 
24 static inline Transliterator::Token cstrToken(const char *s) {
25     return Transliterator::pointerToken((void *)s);
26 }
27 
28 /**
29  * System registration hook.
30  */
registerIDs()31 void NormalizationTransliterator::registerIDs() {
32     // In the Token, the byte after the NUL is the UNormalization2Mode.
33     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
34                                      _create, cstrToken("nfc\0\0"));
35     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
36                                      _create, cstrToken("nfkc\0\0"));
37     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
38                                      _create, cstrToken("nfc\0\1"));
39     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
40                                      _create, cstrToken("nfkc\0\1"));
41     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
42                                      _create, cstrToken("nfc\0\2"));
43     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
44                                      _create, cstrToken("nfc\0\3"));
45     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
46                                             UNICODE_STRING_SIMPLE("NFD"), TRUE);
47     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
48                                             UNICODE_STRING_SIMPLE("NFKD"), TRUE);
49     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
50                                             UNICODE_STRING_SIMPLE("NFD"), FALSE);
51     Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
52                                             UNICODE_STRING_SIMPLE("FCD"), FALSE);
53 }
54 
55 /**
56  * Factory methods
57  */
_create(const UnicodeString & ID,Token context)58 Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
59                                                      Token context) {
60     const char *name = (const char *)context.pointer;
61     UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
62     UErrorCode errorCode = U_ZERO_ERROR;
63     const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
64     if(U_SUCCESS(errorCode)) {
65         return new NormalizationTransliterator(ID, *norm2);
66     } else {
67         return NULL;
68     }
69 }
70 
71 /**
72  * Constructs a transliterator.
73  */
NormalizationTransliterator(const UnicodeString & id,const Normalizer2 & norm2)74 NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
75                                                          const Normalizer2 &norm2) :
76     Transliterator(id, 0), fNorm2(norm2) {}
77 
78 /**
79  * Destructor.
80  */
~NormalizationTransliterator()81 NormalizationTransliterator::~NormalizationTransliterator() {
82 }
83 
84 /**
85  * Copy constructor.
86  */
NormalizationTransliterator(const NormalizationTransliterator & o)87 NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
88     Transliterator(o), fNorm2(o.fNorm2) {}
89 
90 /**
91  * Transliterator API.
92  */
clone(void) const93 Transliterator* NormalizationTransliterator::clone(void) const {
94     return new NormalizationTransliterator(*this);
95 }
96 
97 /**
98  * Implements {@link Transliterator#handleTransliterate}.
99  */
handleTransliterate(Replaceable & text,UTransPosition & offsets,UBool isIncremental) const100 void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
101                                                       UBool isIncremental) const {
102     // start and limit of the input range
103     int32_t start = offsets.start;
104     int32_t limit = offsets.limit;
105     if(start >= limit) {
106         return;
107     }
108 
109     /*
110      * Normalize as short chunks at a time as possible even in
111      * bulk mode, so that styled text is minimally disrupted.
112      * In incremental mode, a chunk that ends with offsets.limit
113      * must not be normalized.
114      *
115      * If it was known that the input text is not styled, then
116      * a bulk mode normalization could look like this:
117 
118     UnicodeString input, normalized;
119     int32_t length = limit - start;
120     _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
121     input.releaseBuffer(length);
122 
123     UErrorCode status = U_ZERO_ERROR;
124     fNorm2.normalize(input, normalized, status);
125 
126     text.handleReplaceBetween(start, limit, normalized);
127 
128     int32_t delta = normalized.length() - length;
129     offsets.contextLimit += delta;
130     offsets.limit += delta;
131     offsets.start = limit + delta;
132 
133      */
134     UErrorCode errorCode = U_ZERO_ERROR;
135     UnicodeString segment;
136     UnicodeString normalized;
137     UChar32 c = text.char32At(start);
138     do {
139         int32_t prev = start;
140         // Skip at least one character so we make progress.
141         // c holds the character at start.
142         segment.remove();
143         do {
144             segment.append(c);
145             start += U16_LENGTH(c);
146         } while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
147         if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
148             // stop in incremental mode when we reach the input limit
149             // in case there are additional characters that could change the
150             // normalization result
151             start=prev;
152             break;
153         }
154         fNorm2.normalize(segment, normalized, errorCode);
155         if(U_FAILURE(errorCode)) {
156             break;
157         }
158         if(segment != normalized) {
159             // replace the input chunk with its normalized form
160             text.handleReplaceBetween(prev, start, normalized);
161 
162             // update all necessary indexes accordingly
163             int32_t delta = normalized.length() - (start - prev);
164             start += delta;
165             limit += delta;
166         }
167     } while(start < limit);
168 
169     offsets.start = start;
170     offsets.contextLimit += limit - offsets.limit;
171     offsets.limit = limit;
172 }
173 
174 U_NAMESPACE_END
175 
176 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
177