1 /*
2  **********************************************************************
3  *   Copyright (c) 2001-2011, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  *   Date        Name        Description
7  *   11/19/2001  aliu        Creation.
8  **********************************************************************
9  */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_TRANSLITERATION
14 
15 #include "unicode/uchar.h"
16 #include "unicode/utf16.h"
17 #include "unesctrn.h"
18 #include "util.h"
19 
20 #include "cmemory.h"
21 
22 U_NAMESPACE_BEGIN
23 
24 /**
25  * Special character marking the end of the spec[] array.
26  */
27 static const UChar END = 0xFFFF;
28 
29 // Unicode: "U+10FFFF" hex, min=4, max=6
30 static const UChar SPEC_Unicode[] = {
31     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,
32     END
33 };
34 
35 // Java: "\\uFFFF" hex, min=4, max=4
36 static const UChar SPEC_Java[] = {
37     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
38     END
39 };
40 
41 // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
42 static const UChar SPEC_C[] = {
43     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,
44     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,
45     END
46 };
47 
48 // XML: "" hex, min=1, max=6
49 static const UChar SPEC_XML[] = {
50     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,
51     END
52 };
53 
54 // XML10: "" dec, min=1, max=7 (not really "Hex-Any")
55 static const UChar SPEC_XML10[] = {
56     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,
57     END
58 };
59 
60 // Perl: "\\x{263A}" hex, min=1, max=6
61 static const UChar SPEC_Perl[] = {
62     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/,
63     END
64 };
65 
66 // All: Java, C, Perl, XML, XML10, Unicode
67 static const UChar SPEC_Any[] = {
68     2, 0, 16, 4, 6, 85/*U*/, 43/*+*/,                      // Unicode
69     2, 0, 16, 4, 4, 92/*\*/, 117/*u*/,                     // Java
70     2, 0, 16, 8, 8, 92/*\*/, 85/*U*/,                      // C (surrogates)
71     3, 1, 16, 1, 6, 38/*&*/, 35/*#*/, 120/*x*/, 59/*;*/,   // XML
72     2, 1, 10, 1, 7, 38/*&*/, 35/*#*/, 59/*;*/,             // XML10
73     3, 1, 16, 1, 6, 92/*\*/, 120/*x*/, 123/*{*/, 125/*}*/, // Perl
74     END
75 };
76 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)77 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnescapeTransliterator)
78 
79 static UChar* copySpec(const UChar* spec) {
80     int32_t len = 0;
81     while (spec[len] != END) {
82         ++len;
83     }
84     ++len;
85     UChar *result = (UChar *)uprv_malloc(len*sizeof(UChar));
86     // Check for memory allocation error.
87     if (result != NULL) {
88     	uprv_memcpy(result, spec, len*sizeof(result[0]));
89     }
90     return result;
91 }
92 
93 /**
94  * Factory methods.  Ignore the context.
95  */
_createUnicode(const UnicodeString & ID,Transliterator::Token)96 static Transliterator* _createUnicode(const UnicodeString& ID, Transliterator::Token /*context*/) {
97     return new UnescapeTransliterator(ID, SPEC_Unicode);
98 }
_createJava(const UnicodeString & ID,Transliterator::Token)99 static Transliterator* _createJava(const UnicodeString& ID, Transliterator::Token /*context*/) {
100     return new UnescapeTransliterator(ID, SPEC_Java);
101 }
_createC(const UnicodeString & ID,Transliterator::Token)102 static Transliterator* _createC(const UnicodeString& ID, Transliterator::Token /*context*/) {
103     return new UnescapeTransliterator(ID, SPEC_C);
104 }
_createXML(const UnicodeString & ID,Transliterator::Token)105 static Transliterator* _createXML(const UnicodeString& ID, Transliterator::Token /*context*/) {
106     return new UnescapeTransliterator(ID, SPEC_XML);
107 }
_createXML10(const UnicodeString & ID,Transliterator::Token)108 static Transliterator* _createXML10(const UnicodeString& ID, Transliterator::Token /*context*/) {
109     return new UnescapeTransliterator(ID, SPEC_XML10);
110 }
_createPerl(const UnicodeString & ID,Transliterator::Token)111 static Transliterator* _createPerl(const UnicodeString& ID, Transliterator::Token /*context*/) {
112     return new UnescapeTransliterator(ID, SPEC_Perl);
113 }
_createAny(const UnicodeString & ID,Transliterator::Token)114 static Transliterator* _createAny(const UnicodeString& ID, Transliterator::Token /*context*/) {
115     return new UnescapeTransliterator(ID, SPEC_Any);
116 }
117 
118 /**
119  * Registers standard variants with the system.  Called by
120  * Transliterator during initialization.
121  */
registerIDs()122 void UnescapeTransliterator::registerIDs() {
123     Token t = integerToken(0);
124 
125     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Unicode"), _createUnicode, t);
126 
127     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Java"), _createJava, t);
128 
129     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/C"), _createC, t);
130 
131     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML"), _createXML, t);
132 
133     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/XML10"), _createXML10, t);
134 
135     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any/Perl"), _createPerl, t);
136 
137     Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Hex-Any"), _createAny, t);
138 }
139 
140 /**
141  * Constructor.  Takes the encoded spec array.
142  */
UnescapeTransliterator(const UnicodeString & newID,const UChar * newSpec)143 UnescapeTransliterator::UnescapeTransliterator(const UnicodeString& newID,
144                                                const UChar *newSpec) :
145     Transliterator(newID, NULL)
146 {
147     this->spec = copySpec(newSpec);
148 }
149 
150 /**
151  * Copy constructor.
152  */
UnescapeTransliterator(const UnescapeTransliterator & o)153 UnescapeTransliterator::UnescapeTransliterator(const UnescapeTransliterator& o) :
154     Transliterator(o) {
155     this->spec = copySpec(o.spec);
156 }
157 
~UnescapeTransliterator()158 UnescapeTransliterator::~UnescapeTransliterator() {
159     uprv_free(spec);
160 }
161 
162 /**
163  * Transliterator API.
164  */
clone() const165 Transliterator* UnescapeTransliterator::clone() const {
166     return new UnescapeTransliterator(*this);
167 }
168 
169 /**
170  * Implements {@link Transliterator#handleTransliterate}.
171  */
handleTransliterate(Replaceable & text,UTransPosition & pos,UBool isIncremental) const172 void UnescapeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
173                                                  UBool isIncremental) const {
174     int32_t start = pos.start;
175     int32_t limit = pos.limit;
176     int32_t i, j, ipat;
177 
178     while (start < limit) {
179         // Loop over the forms in spec[].  Exit this loop when we
180         // match one of the specs.  Exit the outer loop if a
181         // partial match is detected and isIncremental is true.
182         for (j=0, ipat=0; spec[ipat] != END; ++j) {
183 
184             // Read the header
185             int32_t prefixLen = spec[ipat++];
186             int32_t suffixLen = spec[ipat++];
187             int8_t  radix     = (int8_t) spec[ipat++];
188             int32_t minDigits = spec[ipat++];
189             int32_t maxDigits = spec[ipat++];
190 
191             // s is a copy of start that is advanced over the
192             // characters as we parse them.
193             int32_t s = start;
194             UBool match = TRUE;
195 
196             for (i=0; i<prefixLen; ++i) {
197                 if (s >= limit) {
198                     if (i > 0) {
199                         // We've already matched a character.  This is
200                         // a partial match, so we return if in
201                         // incremental mode.  In non-incremental mode,
202                         // go to the next spec.
203                         if (isIncremental) {
204                             goto exit;
205                         }
206                         match = FALSE;
207                         break;
208                     }
209                 }
210                 UChar c = text.charAt(s++);
211                 if (c != spec[ipat + i]) {
212                     match = FALSE;
213                     break;
214                 }
215             }
216 
217             if (match) {
218                 UChar32 u = 0;
219                 int32_t digitCount = 0;
220                 for (;;) {
221                     if (s >= limit) {
222                         // Check for partial match in incremental mode.
223                         if (s > start && isIncremental) {
224                             goto exit;
225                         }
226                         break;
227                     }
228                     UChar32 ch = text.char32At(s);
229                     int32_t digit = u_digit(ch, radix);
230                     if (digit < 0) {
231                         break;
232                     }
233                     s += U16_LENGTH(ch);
234                     u = (u * radix) + digit;
235                     if (++digitCount == maxDigits) {
236                         break;
237                     }
238                 }
239 
240                 match = (digitCount >= minDigits);
241 
242                 if (match) {
243                     for (i=0; i<suffixLen; ++i) {
244                         if (s >= limit) {
245                             // Check for partial match in incremental mode.
246                             if (s > start && isIncremental) {
247                                 goto exit;
248                             }
249                             match = FALSE;
250                             break;
251                         }
252                         UChar c = text.charAt(s++);
253                         if (c != spec[ipat + prefixLen + i]) {
254                             match = FALSE;
255                             break;
256                         }
257                     }
258 
259                     if (match) {
260                         // At this point, we have a match
261                         UnicodeString str(u);
262                         text.handleReplaceBetween(start, s, str);
263                         limit -= s - start - str.length();
264                         // The following break statement leaves the
265                         // loop that is traversing the forms in
266                         // spec[].  We then parse the next input
267                         // character.
268                         break;
269                     }
270                 }
271             }
272 
273             ipat += prefixLen + suffixLen;
274         }
275 
276         if (start < limit) {
277             start += U16_LENGTH(text.char32At(start));
278         }
279     }
280 
281   exit:
282     pos.contextLimit += limit - pos.limit;
283     pos.limit = limit;
284     pos.start = start;
285 }
286 
287 U_NAMESPACE_END
288 
289 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
290 
291 //eof
292