1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 
4 // norms.cpp
5 // created: 2017jun04 Markus W. Scherer
6 // (pulled out of n2builder.cpp)
7 
8 #include "unicode/utypes.h"
9 
10 #if !UCONFIG_NO_NORMALIZATION
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include "unicode/errorcode.h"
15 #include "unicode/umutablecptrie.h"
16 #include "unicode/unistr.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
19 #include "norms.h"
20 #include "toolutil.h"
21 #include "uvectr32.h"
22 
23 U_NAMESPACE_BEGIN
24 
append(UChar32 c,uint8_t cc)25 void BuilderReorderingBuffer::append(UChar32 c, uint8_t cc) {
26     if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
27         if(cc==0) {
28             fLastStarterIndex=fLength;
29         }
30         fArray[fLength++]=(c<<8)|cc;
31         return;
32     }
33     // Let this character bubble back to its canonical order.
34     int32_t i=fLength-1;
35     while(i>fLastStarterIndex && ccAt(i)>cc) {
36         --i;
37     }
38     ++i;  // after the last starter or prevCC<=cc
39     // Move this and the following characters forward one to make space.
40     for(int32_t j=fLength; i<j; --j) {
41         fArray[j]=fArray[j-1];
42     }
43     fArray[i]=(c<<8)|cc;
44     ++fLength;
45     fDidReorder=TRUE;
46 }
47 
toString(UnicodeString & dest) const48 void BuilderReorderingBuffer::toString(UnicodeString &dest) const {
49     dest.remove();
50     for(int32_t i=0; i<fLength; ++i) {
51         dest.append(charAt(i));
52     }
53 }
54 
combine(UChar32 trail) const55 UChar32 Norm::combine(UChar32 trail) const {
56     int32_t length;
57     const CompositionPair *pairs=getCompositionPairs(length);
58     for(int32_t i=0; i<length; ++i) {
59         if(trail==pairs[i].trail) {
60             return pairs[i].composite;
61         }
62         if(trail<pairs[i].trail) {
63             break;
64         }
65     }
66     return U_SENTINEL;
67 }
68 
Norms(UErrorCode & errorCode)69 Norms::Norms(UErrorCode &errorCode) {
70     normTrie = umutablecptrie_open(0, 0, &errorCode);
71     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
72     // Default "inert" Norm struct at index 0. Practically immutable.
73     norms=allocNorm();
74     norms->type=Norm::INERT;
75 }
76 
~Norms()77 Norms::~Norms() {
78     umutablecptrie_close(normTrie);
79     int32_t normsLength=utm_countItems(normMem);
80     for(int32_t i=1; i<normsLength; ++i) {
81         delete norms[i].mapping;
82         delete norms[i].rawMapping;
83         delete norms[i].compositions;
84     }
85     utm_close(normMem);
86 }
87 
allocNorm()88 Norm *Norms::allocNorm() {
89     Norm *p=(Norm *)utm_alloc(normMem);
90     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
91     return p;
92 }
93 
getNorm(UChar32 c)94 Norm *Norms::getNorm(UChar32 c) {
95     uint32_t i = umutablecptrie_get(normTrie, c);
96     if(i==0) {
97         return nullptr;
98     }
99     return norms+i;
100 }
101 
getNorm(UChar32 c) const102 const Norm *Norms::getNorm(UChar32 c) const {
103     uint32_t i = umutablecptrie_get(normTrie, c);
104     if(i==0) {
105         return nullptr;
106     }
107     return norms+i;
108 }
109 
getNormRef(UChar32 c) const110 const Norm &Norms::getNormRef(UChar32 c) const {
111     return norms[umutablecptrie_get(normTrie, c)];
112 }
113 
createNorm(UChar32 c)114 Norm *Norms::createNorm(UChar32 c) {
115     uint32_t i=umutablecptrie_get(normTrie, c);
116     if(i!=0) {
117         return norms+i;
118     } else {
119         /* allocate Norm */
120         Norm *p=allocNorm();
121         IcuToolErrorCode errorCode("gennorm2/createNorm()");
122         umutablecptrie_set(normTrie, c, (uint32_t)(p - norms), errorCode);
123         return p;
124     }
125 }
126 
reorder(UnicodeString & mapping,BuilderReorderingBuffer & buffer) const127 void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
128     int32_t length=mapping.length();
129     U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
130     const char16_t *s=mapping.getBuffer();
131     int32_t i=0;
132     UChar32 c;
133     while(i<length) {
134         U16_NEXT(s, i, length, c);
135         buffer.append(c, getCC(c));
136     }
137     if(buffer.didReorder()) {
138         buffer.toString(mapping);
139     }
140 }
141 
combinesWithCCBetween(const Norm & norm,uint8_t lowCC,int32_t highCC) const142 UBool Norms::combinesWithCCBetween(const Norm &norm, uint8_t lowCC, int32_t highCC) const {
143     if((highCC-lowCC)>=2) {
144         int32_t length;
145         const CompositionPair *pairs=norm.getCompositionPairs(length);
146         for(int32_t i=0; i<length; ++i) {
147             uint8_t trailCC=getCC(pairs[i].trail);
148             if(lowCC<trailCC && trailCC<highCC) {
149                 return TRUE;
150             }
151         }
152     }
153     return FALSE;
154 }
155 
enumRanges(Enumerator & e)156 void Norms::enumRanges(Enumerator &e) {
157     UChar32 start = 0, end;
158     uint32_t i;
159     while ((end = umutablecptrie_getRange(normTrie, start, UCPMAP_RANGE_NORMAL, 0,
160                                           nullptr, nullptr, &i)) >= 0) {
161         if (i > 0) {
162             e.rangeHandler(start, end, norms[i]);
163         }
164         start = end + 1;
165     }
166 }
167 
~Enumerator()168 Norms::Enumerator::~Enumerator() {}
169 
rangeHandler(UChar32 start,UChar32 end,Norm & norm)170 void CompositionBuilder::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
171     if(norm.mappingType!=Norm::ROUND_TRIP) { return; }
172     if(start!=end) {
173         fprintf(stderr,
174                 "gennorm2 error: same round-trip mapping for "
175                 "more than 1 code point U+%04lX..U+%04lX\n",
176                 (long)start, (long)end);
177         exit(U_INVALID_FORMAT_ERROR);
178     }
179     if(norm.cc!=0) {
180         fprintf(stderr,
181                 "gennorm2 error: "
182                 "U+%04lX has a round-trip mapping and ccc!=0, "
183                 "not possible in Unicode normalization\n",
184                 (long)start);
185         exit(U_INVALID_FORMAT_ERROR);
186     }
187     // setRoundTripMapping() ensured that there are exactly two code points.
188     const UnicodeString &m=*norm.mapping;
189     UChar32 lead=m.char32At(0);
190     UChar32 trail=m.char32At(m.length()-1);
191     if(norms.getCC(lead)!=0) {
192         fprintf(stderr,
193                 "gennorm2 error: "
194                 "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
195                 "not possible in Unicode normalization\n",
196                 (long)start, (long)lead);
197         exit(U_INVALID_FORMAT_ERROR);
198     }
199     // Flag for trailing character.
200     norms.createNorm(trail)->combinesBack=TRUE;
201     // Insert (trail, composite) pair into compositions list for the lead character.
202     IcuToolErrorCode errorCode("gennorm2/addComposition()");
203     Norm *leadNorm=norms.createNorm(lead);
204     UVector32 *compositions=leadNorm->compositions;
205     int32_t i;
206     if(compositions==nullptr) {
207         compositions=leadNorm->compositions=new UVector32(errorCode);
208         i=0;  // "insert" the first pair at index 0
209     } else {
210         // Insertion sort, and check for duplicate trail characters.
211         int32_t length;
212         const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
213         for(i=0; i<length; ++i) {
214             if(trail==pairs[i].trail) {
215                 fprintf(stderr,
216                         "gennorm2 error: same round-trip mapping for "
217                         "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
218                         (long)start, (long)lead, (long)trail);
219                 exit(U_INVALID_FORMAT_ERROR);
220             }
221             if(trail<pairs[i].trail) {
222                 break;
223             }
224         }
225     }
226     compositions->insertElementAt(trail, 2*i, errorCode);
227     compositions->insertElementAt(start, 2*i+1, errorCode);
228 }
229 
rangeHandler(UChar32 start,UChar32 end,Norm & norm)230 void Decomposer::rangeHandler(UChar32 start, UChar32 end, Norm &norm) {
231     if(!norm.hasMapping()) { return; }
232     const UnicodeString &m=*norm.mapping;
233     UnicodeString *decomposed=nullptr;
234     const UChar *s=toUCharPtr(m.getBuffer());
235     int32_t length=m.length();
236     int32_t prev, i=0;
237     UChar32 c;
238     while(i<length) {
239         prev=i;
240         U16_NEXT(s, i, length, c);
241         if(start<=c && c<=end) {
242             fprintf(stderr,
243                     "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
244                     (long)c);
245             exit(U_INVALID_FORMAT_ERROR);
246         }
247         const Norm &cNorm=norms.getNormRef(c);
248         if(cNorm.hasMapping()) {
249             if(norm.mappingType==Norm::ROUND_TRIP) {
250                 if(prev==0) {
251                     if(cNorm.mappingType!=Norm::ROUND_TRIP) {
252                         fprintf(stderr,
253                                 "gennorm2 error: "
254                                 "U+%04lX's round-trip mapping's starter "
255                                 "U+%04lX one-way-decomposes, "
256                                 "not possible in Unicode normalization\n",
257                                 (long)start, (long)c);
258                         exit(U_INVALID_FORMAT_ERROR);
259                     }
260                     uint8_t myTrailCC=norms.getCC(m.char32At(i));
261                     UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
262                     uint8_t cTrailCC=norms.getCC(cTrailChar);
263                     if(cTrailCC>myTrailCC) {
264                         fprintf(stderr,
265                                 "gennorm2 error: "
266                                 "U+%04lX's round-trip mapping's starter "
267                                 "U+%04lX decomposes and the "
268                                 "inner/earlier tccc=%hu > outer/following tccc=%hu, "
269                                 "not possible in Unicode normalization\n",
270                                 (long)start, (long)c,
271                                 (short)cTrailCC, (short)myTrailCC);
272                         exit(U_INVALID_FORMAT_ERROR);
273                     }
274                 } else {
275                     fprintf(stderr,
276                             "gennorm2 error: "
277                             "U+%04lX's round-trip mapping's non-starter "
278                             "U+%04lX decomposes, "
279                             "not possible in Unicode normalization\n",
280                             (long)start, (long)c);
281                     exit(U_INVALID_FORMAT_ERROR);
282                 }
283             }
284             if(decomposed==nullptr) {
285                 decomposed=new UnicodeString(m, 0, prev);
286             }
287             decomposed->append(*cNorm.mapping);
288         } else if(Hangul::isHangul(c)) {
289             UChar buffer[3];
290             int32_t hangulLength=Hangul::decompose(c, buffer);
291             if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
292                 fprintf(stderr,
293                         "gennorm2 error: "
294                         "U+%04lX's round-trip mapping's non-starter "
295                         "U+%04lX decomposes, "
296                         "not possible in Unicode normalization\n",
297                         (long)start, (long)c);
298                 exit(U_INVALID_FORMAT_ERROR);
299             }
300             if(decomposed==nullptr) {
301                 decomposed=new UnicodeString(m, 0, prev);
302             }
303             decomposed->append(buffer, hangulLength);
304         } else if(decomposed!=nullptr) {
305             decomposed->append(m, prev, i-prev);
306         }
307     }
308     if(decomposed!=nullptr) {
309         if(norm.rawMapping==nullptr) {
310             // Remember the original mapping when decomposing recursively.
311             norm.rawMapping=norm.mapping;
312         } else {
313             delete norm.mapping;
314         }
315         norm.mapping=decomposed;
316         // Not  norm.setMappingCP();  because the original mapping
317         // is most likely to be encodable as a delta.
318         didDecompose|=TRUE;
319     }
320 }
321 
322 U_NAMESPACE_END
323 
324 #endif // #if !UCONFIG_NO_NORMALIZATION
325