1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2009-2016, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  n2builder.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2009nov25
16 *   created by: Markus W. Scherer
17 *
18 * Builds Normalizer2 data and writes a binary .nrm file.
19 * For the file format see source/common/normalizer2impl.h.
20 */
21 
22 #include "unicode/utypes.h"
23 #include "n2builder.h"
24 
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #if U_HAVE_STD_STRING
29 #include <vector>
30 #endif
31 #include "unicode/errorcode.h"
32 #include "unicode/localpointer.h"
33 #include "unicode/putil.h"
34 #include "unicode/udata.h"
35 #include "unicode/uniset.h"
36 #include "unicode/unistr.h"
37 #include "unicode/ustring.h"
38 #include "charstr.h"
39 #include "hash.h"
40 #include "normalizer2impl.h"
41 #include "toolutil.h"
42 #include "unewdata.h"
43 #include "utrie2.h"
44 #include "uvectr32.h"
45 #include "writesrc.h"
46 
47 #if !UCONFIG_NO_NORMALIZATION
48 
49 /* UDataInfo cf. udata.h */
50 static UDataInfo dataInfo={
51     sizeof(UDataInfo),
52     0,
53 
54     U_IS_BIG_ENDIAN,
55     U_CHARSET_FAMILY,
56     U_SIZEOF_UCHAR,
57     0,
58 
59     { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
60     { 2, 0, 0, 0 },             /* formatVersion */
61     { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
62 };
63 
64 U_NAMESPACE_BEGIN
65 
66 class HangulIterator {
67 public:
68     struct Range {
69         UChar32 start, limit;
70         uint16_t norm16;
71     };
72 
HangulIterator()73     HangulIterator() : rangeIndex(0) {}
nextRange()74     const Range *nextRange() {
75         if(rangeIndex<UPRV_LENGTHOF(ranges)) {
76             return ranges+rangeIndex++;
77         } else {
78             return NULL;
79         }
80     }
reset()81     void reset() { rangeIndex=0; }
82 private:
83     static const Range ranges[4];
84     int32_t rangeIndex;
85 };
86 
87 const HangulIterator::Range HangulIterator::ranges[4]={
88     { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
89     { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
90     // JAMO_T_BASE+1: not U+11A7
91     { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
92     { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
93 };
94 
95 struct CompositionPair {
CompositionPairCompositionPair96     CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
97     UChar32 trail, composite;
98 };
99 
100 struct Norm {
101     enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
102 
hasMappingNorm103     UBool hasMapping() const { return mappingType>REMOVED; }
104 
105     // Requires hasMapping() and well-formed mapping.
setMappingCPNorm106     void setMappingCP() {
107         UChar32 c;
108         if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
109             mappingCP=c;
110         } else {
111             mappingCP=U_SENTINEL;
112         }
113     }
114 
getCompositionPairsNorm115     const CompositionPair *getCompositionPairs(int32_t &length) const {
116         if(compositions==NULL) {
117             length=0;
118             return NULL;
119         } else {
120             length=compositions->size()/2;
121             return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
122         }
123     }
124 
125     UnicodeString *mapping;
126     UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
127     UChar32 mappingCP;  // >=0 if mapping to 1 code point
128     int32_t mappingPhase;
129     MappingType mappingType;
130 
131     UVector32 *compositions;  // (trail, composite) pairs
132     uint8_t cc;
133     UBool combinesBack;
134     UBool hasNoCompBoundaryAfter;
135 
136     enum OffsetType {
137         OFFSET_NONE,
138         // Composition for back-combining character. Allowed, but not normally used.
139         OFFSET_MAYBE_YES,
140         // Composition for a starter that does not have a decomposition mapping.
141         OFFSET_YES_YES,
142         // Round-trip mapping & composition for a starter.
143         OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
144         // Round-trip mapping for a starter that itself does not combine-forward.
145         OFFSET_YES_NO_MAPPING_ONLY,
146         // One-way mapping.
147         OFFSET_NO_NO,
148         // Delta for an algorithmic one-way mapping.
149         OFFSET_DELTA
150     };
151     enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
152     int32_t offset;
153 };
154 
155 class Normalizer2DBEnumerator {
156 public:
Normalizer2DBEnumerator(Normalizer2DataBuilder & b)157     Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
~Normalizer2DBEnumerator()158     virtual ~Normalizer2DBEnumerator() {}
159     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
ptr()160     Normalizer2DBEnumerator *ptr() { return this; }
161 protected:
162     Normalizer2DataBuilder &builder;
163 };
164 
165 U_CDECL_BEGIN
166 
167 static UBool U_CALLCONV
enumRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)168 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
169     return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
170 }
171 
172 U_CDECL_END
173 
Normalizer2DataBuilder(UErrorCode & errorCode)174 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
175         phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
176         norm16TrieLength(0) {
177     memset(unicodeVersion, 0, sizeof(unicodeVersion));
178     normTrie=utrie2_open(0, 0, &errorCode);
179     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
180     norms=allocNorm();  // unused Norm struct at index 0
181     memset(indexes, 0, sizeof(indexes));
182     memset(smallFCD, 0, sizeof(smallFCD));
183 }
184 
~Normalizer2DataBuilder()185 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
186     utrie2_close(normTrie);
187     int32_t normsLength=utm_countItems(normMem);
188     for(int32_t i=1; i<normsLength; ++i) {
189         delete norms[i].mapping;
190         delete norms[i].rawMapping;
191         delete norms[i].compositions;
192     }
193     utm_close(normMem);
194     utrie2_close(norm16Trie);
195 }
196 
197 void
setUnicodeVersion(const char * v)198 Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
199     UVersionInfo nullVersion={ 0, 0, 0, 0 };
200     UVersionInfo version;
201     u_versionFromString(version, v);
202     if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
203         0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
204     ) {
205         char buffer[U_MAX_VERSION_STRING_LENGTH];
206         u_versionToString(unicodeVersion, buffer);
207         fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
208                 buffer, v);
209         exit(U_ILLEGAL_ARGUMENT_ERROR);
210     }
211     memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
212 }
213 
allocNorm()214 Norm *Normalizer2DataBuilder::allocNorm() {
215     Norm *p=(Norm *)utm_alloc(normMem);
216     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
217     return p;
218 }
219 
220 /* get an existing Norm unit */
getNorm(UChar32 c)221 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
222     uint32_t i=utrie2_get32(normTrie, c);
223     if(i==0) {
224         return NULL;
225     }
226     return norms+i;
227 }
228 
getNormRef(UChar32 c) const229 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
230     return norms[utrie2_get32(normTrie, c)];
231 }
232 
233 /*
234  * get or create a Norm unit;
235  * get or create the intermediate trie entries for it as well
236  */
createNorm(UChar32 c)237 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
238     uint32_t i=utrie2_get32(normTrie, c);
239     if(i!=0) {
240         return norms+i;
241     } else {
242         /* allocate Norm */
243         Norm *p=allocNorm();
244         IcuToolErrorCode errorCode("gennorm2/createNorm()");
245         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
246         return p;
247     }
248 }
249 
checkNormForMapping(Norm * p,UChar32 c)250 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
251     if(p!=NULL) {
252         if(p->mappingType!=Norm::NONE) {
253             if( overrideHandling==OVERRIDE_NONE ||
254                 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
255             ) {
256                 fprintf(stderr,
257                         "error in gennorm2 phase %d: "
258                         "not permitted to override mapping for U+%04lX from phase %d\n",
259                         (int)phase, (long)c, (int)p->mappingPhase);
260                 exit(U_INVALID_FORMAT_ERROR);
261             }
262             delete p->mapping;
263             p->mapping=NULL;
264         }
265         p->mappingPhase=phase;
266     }
267     return p;
268 }
269 
setOverrideHandling(OverrideHandling oh)270 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
271     overrideHandling=oh;
272     ++phase;
273 }
274 
setCC(UChar32 c,uint8_t cc)275 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
276     createNorm(c)->cc=cc;
277 }
278 
getCC(UChar32 c) const279 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
280     return getNormRef(c).cc;
281 }
282 
isWellFormed(const UnicodeString & s)283 static UBool isWellFormed(const UnicodeString &s) {
284     UErrorCode errorCode=U_ZERO_ERROR;
285     u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
286     return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
287 }
288 
setOneWayMapping(UChar32 c,const UnicodeString & m)289 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
290     if(!isWellFormed(m)) {
291         fprintf(stderr,
292                 "error in gennorm2 phase %d: "
293                 "illegal one-way mapping from U+%04lX to malformed string\n",
294                 (int)phase, (long)c);
295         exit(U_INVALID_FORMAT_ERROR);
296     }
297     Norm *p=checkNormForMapping(createNorm(c), c);
298     p->mapping=new UnicodeString(m);
299     p->mappingType=Norm::ONE_WAY;
300     p->setMappingCP();
301 }
302 
setRoundTripMapping(UChar32 c,const UnicodeString & m)303 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
304     if(U_IS_SURROGATE(c)) {
305         fprintf(stderr,
306                 "error in gennorm2 phase %d: "
307                 "illegal round-trip mapping from surrogate code point U+%04lX\n",
308                 (int)phase, (long)c);
309         exit(U_INVALID_FORMAT_ERROR);
310     }
311     if(!isWellFormed(m)) {
312         fprintf(stderr,
313                 "error in gennorm2 phase %d: "
314                 "illegal round-trip mapping from U+%04lX to malformed string\n",
315                 (int)phase, (long)c);
316         exit(U_INVALID_FORMAT_ERROR);
317     }
318     int32_t numCP=u_countChar32(m.getBuffer(), m.length());
319     if(numCP!=2) {
320         fprintf(stderr,
321                 "error in gennorm2 phase %d: "
322                 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
323                 (int)phase, (long)c, (int)numCP);
324         exit(U_INVALID_FORMAT_ERROR);
325     }
326     Norm *p=checkNormForMapping(createNorm(c), c);
327     p->mapping=new UnicodeString(m);
328     p->mappingType=Norm::ROUND_TRIP;
329     p->mappingCP=U_SENTINEL;
330 }
331 
removeMapping(UChar32 c)332 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
333     Norm *p=checkNormForMapping(getNorm(c), c);
334     if(p!=NULL) {
335         p->mappingType=Norm::REMOVED;
336     }
337 }
338 
339 class CompositionBuilder : public Normalizer2DBEnumerator {
340 public:
CompositionBuilder(Normalizer2DataBuilder & b)341     CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
rangeHandler(UChar32 start,UChar32 end,uint32_t value)342     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
343         builder.addComposition(start, end, value);
344         return TRUE;
345     }
346 };
347 
348 void
addComposition(UChar32 start,UChar32 end,uint32_t value)349 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
350     if(norms[value].mappingType==Norm::ROUND_TRIP) {
351         if(start!=end) {
352             fprintf(stderr,
353                     "gennorm2 error: same round-trip mapping for "
354                     "more than 1 code point U+%04lX..U+%04lX\n",
355                     (long)start, (long)end);
356             exit(U_INVALID_FORMAT_ERROR);
357         }
358         if(norms[value].cc!=0) {
359             fprintf(stderr,
360                     "gennorm2 error: "
361                     "U+%04lX has a round-trip mapping and ccc!=0, "
362                     "not possible in Unicode normalization\n",
363                     (long)start);
364             exit(U_INVALID_FORMAT_ERROR);
365         }
366         // setRoundTripMapping() ensured that there are exactly two code points.
367         const UnicodeString &m=*norms[value].mapping;
368         UChar32 lead=m.char32At(0);
369         UChar32 trail=m.char32At(m.length()-1);
370         if(getCC(lead)!=0) {
371             fprintf(stderr,
372                     "gennorm2 error: "
373                     "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
374                     "not possible in Unicode normalization\n",
375                     (long)start, (long)lead);
376             exit(U_INVALID_FORMAT_ERROR);
377         }
378         // Flag for trailing character.
379         createNorm(trail)->combinesBack=TRUE;
380         // Insert (trail, composite) pair into compositions list for the lead character.
381         IcuToolErrorCode errorCode("gennorm2/addComposition()");
382         Norm *leadNorm=createNorm(lead);
383         UVector32 *compositions=leadNorm->compositions;
384         int32_t i;
385         if(compositions==NULL) {
386             compositions=leadNorm->compositions=new UVector32(errorCode);
387             i=0;  // "insert" the first pair at index 0
388         } else {
389             // Insertion sort, and check for duplicate trail characters.
390             int32_t length;
391             const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
392             for(i=0; i<length; ++i) {
393                 if(trail==pairs[i].trail) {
394                     fprintf(stderr,
395                             "gennorm2 error: same round-trip mapping for "
396                             "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
397                             (long)start, (long)lead, (long)trail);
398                     exit(U_INVALID_FORMAT_ERROR);
399                 }
400                 if(trail<pairs[i].trail) {
401                     break;
402                 }
403             }
404         }
405         compositions->insertElementAt(trail, 2*i, errorCode);
406         compositions->insertElementAt(start, 2*i+1, errorCode);
407     }
408 }
409 
combinesWithCCBetween(const Norm & norm,uint8_t lowCC,uint8_t highCC) const410 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
411                                                     uint8_t lowCC, uint8_t highCC) const {
412     if((highCC-lowCC)>=2) {
413         int32_t length;
414         const CompositionPair *pairs=norm.getCompositionPairs(length);
415         for(int32_t i=0; i<length; ++i) {
416             uint8_t trailCC=getCC(pairs[i].trail);
417             if(lowCC<trailCC && trailCC<highCC) {
418                 return TRUE;
419             }
420         }
421     }
422     return FALSE;
423 }
424 
combine(const Norm & norm,UChar32 trail) const425 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
426     int32_t length;
427     const CompositionPair *pairs=norm.getCompositionPairs(length);
428     for(int32_t i=0; i<length; ++i) {
429         if(trail==pairs[i].trail) {
430             return pairs[i].composite;
431         }
432         if(trail<pairs[i].trail) {
433             break;
434         }
435     }
436     return U_SENTINEL;
437 }
438 
439 class Decomposer : public Normalizer2DBEnumerator {
440 public:
Decomposer(Normalizer2DataBuilder & b)441     Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
rangeHandler(UChar32 start,UChar32 end,uint32_t value)442     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
443         didDecompose|=builder.decompose(start, end, value);
444         return TRUE;
445     }
446     UBool didDecompose;
447 };
448 
449 UBool
decompose(UChar32 start,UChar32 end,uint32_t value)450 Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
451     if(norms[value].hasMapping()) {
452         Norm &norm=norms[value];
453         const UnicodeString &m=*norm.mapping;
454         UnicodeString *decomposed=NULL;
455         const UChar *s=m.getBuffer();
456         int32_t length=m.length();
457         int32_t prev, i=0;
458         UChar32 c;
459         while(i<length) {
460             prev=i;
461             U16_NEXT(s, i, length, c);
462             if(start<=c && c<=end) {
463                 fprintf(stderr,
464                         "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
465                         (long)c);
466                 exit(U_INVALID_FORMAT_ERROR);
467             }
468             const Norm &cNorm=getNormRef(c);
469             if(cNorm.hasMapping()) {
470                 if(norm.mappingType==Norm::ROUND_TRIP) {
471                     if(prev==0) {
472                         if(cNorm.mappingType!=Norm::ROUND_TRIP) {
473                             fprintf(stderr,
474                                     "gennorm2 error: "
475                                     "U+%04lX's round-trip mapping's starter "
476                                     "U+%04lX one-way-decomposes, "
477                                     "not possible in Unicode normalization\n",
478                                     (long)start, (long)c);
479                             exit(U_INVALID_FORMAT_ERROR);
480                         }
481                         uint8_t myTrailCC=getCC(m.char32At(i));
482                         UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
483                         uint8_t cTrailCC=getCC(cTrailChar);
484                         if(cTrailCC>myTrailCC) {
485                             fprintf(stderr,
486                                     "gennorm2 error: "
487                                     "U+%04lX's round-trip mapping's starter "
488                                     "U+%04lX decomposes and the "
489                                     "inner/earlier tccc=%hu > outer/following tccc=%hu, "
490                                     "not possible in Unicode normalization\n",
491                                     (long)start, (long)c,
492                                     (short)cTrailCC, (short)myTrailCC);
493                             exit(U_INVALID_FORMAT_ERROR);
494                         }
495                     } else {
496                         fprintf(stderr,
497                                 "gennorm2 error: "
498                                 "U+%04lX's round-trip mapping's non-starter "
499                                 "U+%04lX decomposes, "
500                                 "not possible in Unicode normalization\n",
501                                 (long)start, (long)c);
502                         exit(U_INVALID_FORMAT_ERROR);
503                     }
504                 }
505                 if(decomposed==NULL) {
506                     decomposed=new UnicodeString(m, 0, prev);
507                 }
508                 decomposed->append(*cNorm.mapping);
509             } else if(Hangul::isHangul(c)) {
510                 UChar buffer[3];
511                 int32_t hangulLength=Hangul::decompose(c, buffer);
512                 if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
513                     fprintf(stderr,
514                             "gennorm2 error: "
515                             "U+%04lX's round-trip mapping's non-starter "
516                             "U+%04lX decomposes, "
517                             "not possible in Unicode normalization\n",
518                             (long)start, (long)c);
519                     exit(U_INVALID_FORMAT_ERROR);
520                 }
521                 if(decomposed==NULL) {
522                     decomposed=new UnicodeString(m, 0, prev);
523                 }
524                 decomposed->append(buffer, hangulLength);
525             } else if(decomposed!=NULL) {
526                 decomposed->append(m, prev, i-prev);
527             }
528         }
529         if(decomposed!=NULL) {
530             if(norm.rawMapping==NULL) {
531                 // Remember the original mapping when decomposing recursively.
532                 norm.rawMapping=norm.mapping;
533             } else {
534                 delete norm.mapping;
535             }
536             norm.mapping=decomposed;
537             // Not  norm.setMappingCP();  because the original mapping
538             // is most likely to be encodable as a delta.
539             return TRUE;
540         }
541     }
542     return FALSE;
543 }
544 
545 class BuilderReorderingBuffer {
546 public:
BuilderReorderingBuffer()547     BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
reset()548     void reset() {
549         fLength=0;
550         fLastStarterIndex=-1;
551         fDidReorder=FALSE;
552     }
length() const553     int32_t length() const { return fLength; }
isEmpty() const554     UBool isEmpty() const { return fLength==0; }
lastStarterIndex() const555     int32_t lastStarterIndex() const { return fLastStarterIndex; }
charAt(int32_t i) const556     UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
ccAt(int32_t i) const557     uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
didReorder() const558     UBool didReorder() const { return fDidReorder; }
append(UChar32 c,uint8_t cc)559     void append(UChar32 c, uint8_t cc) {
560         if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
561             if(cc==0) {
562                 fLastStarterIndex=fLength;
563             }
564             fArray[fLength++]=(c<<8)|cc;
565             return;
566         }
567         // Let this character bubble back to its canonical order.
568         int32_t i=fLength-1;
569         while(i>fLastStarterIndex && ccAt(i)>cc) {
570             --i;
571         }
572         ++i;  // after the last starter or prevCC<=cc
573         // Move this and the following characters forward one to make space.
574         for(int32_t j=fLength; i<j; --j) {
575             fArray[j]=fArray[j-1];
576         }
577         fArray[i]=(c<<8)|cc;
578         ++fLength;
579         fDidReorder=TRUE;
580     }
toString(UnicodeString & dest)581     void toString(UnicodeString &dest) {
582         dest.remove();
583         for(int32_t i=0; i<fLength; ++i) {
584             dest.append(charAt(i));
585         }
586     }
setComposite(UChar32 composite,int32_t combMarkIndex)587     void setComposite(UChar32 composite, int32_t combMarkIndex) {
588         fArray[fLastStarterIndex]=composite<<8;
589         // Remove the combining mark that contributed to the composite.
590         --fLength;
591         while(combMarkIndex<fLength) {
592             fArray[combMarkIndex]=fArray[combMarkIndex+1];
593             ++combMarkIndex;
594         }
595     }
596 private:
597     int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
598     int32_t fLength;
599     int32_t fLastStarterIndex;
600     UBool fDidReorder;
601 };
602 
603 void
reorder(Norm * p,BuilderReorderingBuffer & buffer)604 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
605     UnicodeString &m=*p->mapping;
606     int32_t length=m.length();
607     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
608         return;  // writeMapping() will complain about it and print the code point.
609     }
610     const UChar *s=m.getBuffer();
611     int32_t i=0;
612     UChar32 c;
613     while(i<length) {
614         U16_NEXT(s, i, length, c);
615         buffer.append(c, getCC(c));
616     }
617     if(buffer.didReorder()) {
618         buffer.toString(m);
619     }
620 }
621 
622 /*
623  * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
624  * A starter character with a mapping does not have a composition boundary after it
625  * if the character itself combines-forward (which is tested by the caller of this function),
626  * or it is deleted (mapped to the empty string),
627  * or its mapping contains no starter,
628  * or the last starter combines-forward.
629  */
hasNoCompBoundaryAfter(BuilderReorderingBuffer & buffer)630 UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
631     if(buffer.isEmpty()) {
632         return TRUE;  // maps-to-empty-string is no boundary of any kind
633     }
634     int32_t lastStarterIndex=buffer.lastStarterIndex();
635     if(lastStarterIndex<0) {
636         return TRUE;  // no starter
637     }
638     UChar32 starter=buffer.charAt(lastStarterIndex);
639     if( Hangul::isJamoL(starter) ||
640         (Hangul::isJamoV(starter) &&
641          0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
642     ) {
643         // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
644         // otherwise it is blocked.
645         return lastStarterIndex==buffer.length()-1;
646     }
647     // Note: There can be no Hangul syllable in the fully decomposed mapping.
648     const Norm *starterNorm=&getNormRef(starter);
649     if(starterNorm->compositions==NULL) {
650         return FALSE;  // the last starter does not combine forward
651     }
652     // Compose as far as possible, and see if further compositions are possible.
653     uint8_t prevCC=0;
654     for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
655         uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
656         if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
657             return TRUE;
658         }
659         if( prevCC<cc &&
660             (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
661         ) {
662             buffer.setComposite(starter, combMarkIndex);
663             starterNorm=&getNormRef(starter);
664             if(starterNorm->compositions==NULL) {
665                 return FALSE;  // the composite does not combine further
666             }
667         } else {
668             prevCC=cc;
669             ++combMarkIndex;
670         }
671     }
672     // TRUE if the final, forward-combining starter is at the end.
673     return prevCC==0;
674 }
675 
676 // Requires p->hasMapping().
677 // Returns the offset of the "first unit" from the beginning of the extraData for c.
678 // That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
writeMapping(UChar32 c,const Norm * p,UnicodeString & dataString)679 int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
680     UnicodeString &m=*p->mapping;
681     int32_t length=m.length();
682     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
683         fprintf(stderr,
684                 "gennorm2 error: "
685                 "mapping for U+%04lX longer than maximum of %d\n",
686                 (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
687         exit(U_INVALID_FORMAT_ERROR);
688     }
689     int32_t leadCC, trailCC;
690     if(length==0) {
691         leadCC=trailCC=0;
692     } else {
693         leadCC=getCC(m.char32At(0));
694         trailCC=getCC(m.char32At(length-1));
695     }
696     if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
697         fprintf(stderr,
698                 "gennorm2 error: "
699                 "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
700                 (long)c);
701         exit(U_INVALID_FORMAT_ERROR);
702     }
703     // Write small-FCD data.
704     if((leadCC|trailCC)!=0) {
705         UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
706         smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
707     }
708     // Write the mapping & raw mapping extraData.
709     int32_t firstUnit=length|(trailCC<<8);
710     int32_t preMappingLength=0;
711     if(p->rawMapping!=NULL) {
712         UnicodeString &rm=*p->rawMapping;
713         int32_t rmLength=rm.length();
714         if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
715             fprintf(stderr,
716                     "gennorm2 error: "
717                     "raw mapping for U+%04lX longer than maximum of %d\n",
718                     (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
719             exit(U_INVALID_FORMAT_ERROR);
720         }
721         UChar rm0=rm.charAt(0);
722         if( rmLength==length-1 &&
723             // 99: overlong substring lengths get pinned to remainder lengths anyway
724             0==rm.compare(1, 99, m, 2, 99) &&
725             rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
726         ) {
727             // Compression:
728             // rawMapping=rm0+mapping.substring(2) -> store only rm0
729             //
730             // The raw mapping is the same as the final mapping after replacing
731             // the final mapping's first two code units with the raw mapping's first one.
732             // In this case, we store only that first unit, rm0.
733             // This helps with a few hundred mappings.
734             dataString.append(rm0);
735             preMappingLength=1;
736         } else {
737             // Store the raw mapping with its length.
738             dataString.append(rm);
739             dataString.append((UChar)rmLength);
740             preMappingLength=rmLength+1;
741         }
742         firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
743     }
744     int32_t cccLccc=p->cc|(leadCC<<8);
745     if(cccLccc!=0) {
746         dataString.append((UChar)cccLccc);
747         ++preMappingLength;
748         firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
749     }
750     if(p->hasNoCompBoundaryAfter) {
751         firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
752     }
753     dataString.append((UChar)firstUnit);
754     dataString.append(m);
755     return preMappingLength;
756 }
757 
758 // Requires p->compositions!=NULL.
writeCompositions(UChar32 c,const Norm * p,UnicodeString & dataString)759 void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
760     if(p->cc!=0) {
761         fprintf(stderr,
762                 "gennorm2 error: "
763                 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
764                 (long)c);
765         exit(U_INVALID_FORMAT_ERROR);
766     }
767     int32_t length;
768     const CompositionPair *pairs=p->getCompositionPairs(length);
769     for(int32_t i=0; i<length; ++i) {
770         const CompositionPair &pair=pairs[i];
771         // 22 bits for the composite character and whether it combines forward.
772         UChar32 compositeAndFwd=pair.composite<<1;
773         if(getNormRef(pair.composite).compositions!=NULL) {
774             compositeAndFwd|=1;  // The composite character also combines-forward.
775         }
776         // Encode most pairs in two units and some in three.
777         int32_t firstUnit, secondUnit, thirdUnit;
778         if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
779             if(compositeAndFwd<=0xffff) {
780                 firstUnit=pair.trail<<1;
781                 secondUnit=compositeAndFwd;
782                 thirdUnit=-1;
783             } else {
784                 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
785                 secondUnit=compositeAndFwd>>16;
786                 thirdUnit=compositeAndFwd;
787             }
788         } else {
789             firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
790                        (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
791                       Normalizer2Impl::COMP_1_TRIPLE;
792             secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
793                        (compositeAndFwd>>16);
794             thirdUnit=compositeAndFwd;
795         }
796         // Set the high bit of the first unit if this is the last composition pair.
797         if(i==(length-1)) {
798             firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
799         }
800         dataString.append((UChar)firstUnit).append((UChar)secondUnit);
801         if(thirdUnit>=0) {
802             dataString.append((UChar)thirdUnit);
803         }
804     }
805 }
806 
807 class ExtraDataWriter : public Normalizer2DBEnumerator {
808 public:
ExtraDataWriter(Normalizer2DataBuilder & b)809     ExtraDataWriter(Normalizer2DataBuilder &b) :
810         Normalizer2DBEnumerator(b),
811         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
812         yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
rangeHandler(UChar32 start,UChar32 end,uint32_t value)813     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
814         if(value!=0) {
815             if(start!=end) {
816                 fprintf(stderr,
817                         "gennorm2 error: unexpected shared data for "
818                         "multiple code points U+%04lX..U+%04lX\n",
819                         (long)start, (long)end);
820                 exit(U_INTERNAL_PROGRAM_ERROR);
821             }
822             builder.writeExtraData(start, value, *this);
823         }
824         return TRUE;
825     }
826     UnicodeString maybeYesCompositions;
827     UnicodeString yesYesCompositions;
828     UnicodeString yesNoMappingsAndCompositions;
829     UnicodeString yesNoMappingsOnly;
830     UnicodeString noNoMappings;
831     Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
832 };
833 
writeExtraData(UChar32 c,uint32_t value,ExtraDataWriter & writer)834 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
835     Norm *p=norms+value;
836     if(!p->hasMapping()) {
837         // Write small-FCD data.
838         // There is similar code in writeMapping() for characters that do have a mapping.
839         if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
840             fprintf(stderr,
841                     "gennorm2 error: "
842                     "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
843                     (long)c);
844             exit(U_INVALID_FORMAT_ERROR);
845         }
846         if(p->cc!=0) {
847             UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
848             smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
849         }
850     }
851     if(p->combinesBack) {
852         if(p->hasMapping()) {
853             fprintf(stderr,
854                     "gennorm2 error: "
855                     "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
856                     (long)c);
857             exit(U_INVALID_FORMAT_ERROR);
858         }
859         if(p->compositions!=NULL) {
860             p->offset=
861                 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
862                 Norm::OFFSET_MAYBE_YES;
863             writeCompositions(c, p, writer.maybeYesCompositions);
864         }
865     } else if(!p->hasMapping()) {
866         if(p->compositions!=NULL) {
867             p->offset=
868                 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
869                 Norm::OFFSET_YES_YES;
870             writeCompositions(c, p, writer.yesYesCompositions);
871         }
872     } else if(p->mappingType==Norm::ROUND_TRIP) {
873         if(p->compositions!=NULL) {
874             int32_t offset=writer.yesNoMappingsAndCompositions.length()+
875                            writeMapping(c, p, writer.yesNoMappingsAndCompositions);
876             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
877             writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
878         } else {
879             int32_t offset=writer.yesNoMappingsOnly.length()+
880                            writeMapping(c, p, writer.yesNoMappingsOnly);
881             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
882         }
883     } else /* one-way */ {
884         if(p->compositions!=NULL) {
885             fprintf(stderr,
886                     "gennorm2 error: "
887                     "U+%04lX combines-forward and has a one-way mapping, "
888                     "not possible in Unicode normalization\n",
889                     (long)c);
890             exit(U_INVALID_FORMAT_ERROR);
891         }
892         if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
893             // Try a compact, algorithmic encoding.
894             // Only for ccc=0, because we can't store additional information
895             // and we do not recursively follow an algorithmic encoding for access to the ccc.
896             //
897             // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
898             // if the mappingCP decomposes further, to ensure that there is a place to store it.
899             // We want to see that the final mapping does not have exactly 1 code point,
900             // or else we would have to recursively ensure that the final mapping is stored
901             // in normal extraData.
902             if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
903                 int32_t delta=p->mappingCP-c;
904                 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
905                     p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
906                 }
907             }
908         }
909         if(p->offset==0) {
910             int32_t oldNoNoLength=writer.noNoMappings.length();
911             int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
912             UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
913             int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
914             if(previousOffset!=0) {
915                 // Duplicate, remove the new units and point to the old ones.
916                 writer.noNoMappings.truncate(oldNoNoLength);
917                 p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
918             } else {
919                 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
920                 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
921                 writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
922                 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
923             }
924         }
925     }
926 }
927 
928 class Norm16Writer : public Normalizer2DBEnumerator {
929 public:
Norm16Writer(Normalizer2DataBuilder & b)930     Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
rangeHandler(UChar32 start,UChar32 end,uint32_t value)931     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
932         builder.writeNorm16(start, end, value);
933         return TRUE;
934     }
935 };
936 
writeNorm16(UChar32 start,UChar32 end,uint32_t value)937 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
938     if(value!=0) {
939         const Norm *p=norms+value;
940         int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
941         int32_t norm16=0;
942         UBool isDecompNo=FALSE;
943         UBool isCompNoMaybe=FALSE;
944         switch(p->offset&Norm::OFFSET_MASK) {
945         case Norm::OFFSET_NONE:
946             // No mapping, no compositions list.
947             if(p->combinesBack) {
948                 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
949                 isDecompNo=(UBool)(p->cc!=0);
950                 isCompNoMaybe=TRUE;
951             } else if(p->cc!=0) {
952                 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
953                 isDecompNo=isCompNoMaybe=TRUE;
954             }
955             break;
956         case Norm::OFFSET_MAYBE_YES:
957             norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
958             isCompNoMaybe=TRUE;
959             break;
960         case Norm::OFFSET_YES_YES:
961             norm16=offset;
962             break;
963         case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
964             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
965             isDecompNo=TRUE;
966             break;
967         case Norm::OFFSET_YES_NO_MAPPING_ONLY:
968             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
969             isDecompNo=TRUE;
970             break;
971         case Norm::OFFSET_NO_NO:
972             norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
973             isDecompNo=isCompNoMaybe=TRUE;
974             break;
975         case Norm::OFFSET_DELTA:
976             norm16=getCenterNoNoDelta()+offset;
977             isDecompNo=isCompNoMaybe=TRUE;
978             break;
979         default:  // Should not occur.
980             exit(U_INTERNAL_PROGRAM_ERROR);
981         }
982         IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
983         utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
984         if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
985             indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
986         }
987         if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
988             indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
989         }
990     }
991 }
992 
setHangulData()993 void Normalizer2DataBuilder::setHangulData() {
994     HangulIterator hi;
995     const HangulIterator::Range *range;
996     // Check that none of the Hangul/Jamo code points have data.
997     while((range=hi.nextRange())!=NULL) {
998         for(UChar32 c=range->start; c<range->limit; ++c) {
999             if(utrie2_get32(norm16Trie, c)!=0) {
1000                 fprintf(stderr,
1001                         "gennorm2 error: "
1002                         "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
1003                         (long)c);
1004                 exit(U_INVALID_FORMAT_ERROR);
1005             }
1006         }
1007     }
1008     // Set data for algorithmic runtime handling.
1009     IcuToolErrorCode errorCode("gennorm2/setHangulData()");
1010     hi.reset();
1011     while((range=hi.nextRange())!=NULL) {
1012         uint16_t norm16=range->norm16;
1013         if(norm16==0) {
1014             norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
1015             if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
1016                 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
1017             }
1018         } else {
1019             if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
1020                 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
1021             }
1022         }
1023         utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
1024         errorCode.assertSuccess();
1025     }
1026 }
1027 
1028 U_CDECL_BEGIN
1029 
1030 static UBool U_CALLCONV
enumRangeMaxValue(const void * context,UChar32,UChar32,uint32_t value)1031 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1032     uint32_t *pMaxValue=(uint32_t *)context;
1033     if(value>*pMaxValue) {
1034         *pMaxValue=value;
1035     }
1036     return TRUE;
1037 }
1038 
1039 U_CDECL_END
1040 
// Turns the collected per-code-point data (normTrie/norms) into the output
// structures: extraData, the frozen norm16Trie with its serialized length,
// the indexes[] array, and dataInfo.dataVersion.
// writeBinaryFile() and writeCSourceFile() call this before writing anything.
// Fatal problems are reported on stderr and terminate the program via exit().
void Normalizer2DataBuilder::processData() {
    IcuToolErrorCode errorCode("gennorm2/processData()");
    norm16Trie=utrie2_open(0, 0, errorCode);
    errorCode.assertSuccess();

    // Run the CompositionBuilder enumerator over all ranges of normTrie.
    utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());

    // Repeat the Decomposer pass until a full pass reports no further decomposition.
    Decomposer decomposer(*this);
    do {
        decomposer.didDecompose=FALSE;
        utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
    } while(decomposer.didDecompose);

    BuilderReorderingBuffer buffer;
    int32_t normsLength=utm_countItems(normMem);
    // Index 0 is skipped; a trie value of 0 means "no data" in the range handlers.
    for(int32_t i=1; i<normsLength; ++i) {
        // Set the hasNoCompBoundaryAfter flag for use by the last code branch
        // in Normalizer2Impl::hasCompBoundaryAfter().
        // For details see the comments on hasNoCompBoundaryAfter(buffer).
        const Norm &norm=norms[i];
        if(norm.hasMapping()) {
            if(norm.compositions!=NULL) {
                norms[i].hasNoCompBoundaryAfter=TRUE;
            } else {
                buffer.reset();
                reorder(norms+i, buffer);
                norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
            }
        }
    }

    // Start both minimum code points above the Unicode range;
    // writeNorm16() and setHangulData() lower them as they find relevant code points.
    indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
    indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;

    // Collect mappings and compositions lists into the writer's per-category strings.
    ExtraDataWriter extraDataWriter(*this);
    utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);

    // Concatenate the categories into the single extraData string.
    extraData=extraDataWriter.maybeYesCompositions;
    extraData.append(extraDataWriter.yesYesCompositions).
              append(extraDataWriter.yesNoMappingsAndCompositions).
              append(extraDataWriter.yesNoMappingsOnly).
              append(extraDataWriter.noNoMappings);
    // Pad to even length for 4-byte alignment of following data.
    if(extraData.length()&1) {
        extraData.append((UChar)0);
    }

    // The norm16 thresholds are the cumulative lengths of the yesYes, yesNo (two kinds)
    // and noNo strings; minMaybeYes lies below MIN_NORMAL_MAYBE_YES by the length
    // of the maybeYes compositions.
    indexes[Normalizer2Impl::IX_MIN_YES_NO]=
        extraDataWriter.yesYesCompositions.length();
    indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
        indexes[Normalizer2Impl::IX_MIN_YES_NO]+
        extraDataWriter.yesNoMappingsAndCompositions.length();
    indexes[Normalizer2Impl::IX_MIN_NO_NO]=
        indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
        extraDataWriter.yesNoMappingsOnly.length();
    indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
        indexes[Normalizer2Impl::IX_MIN_NO_NO]+
        extraDataWriter.noNoMappings.length();
    indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
        Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
        extraDataWriter.maybeYesCompositions.length();

    // limitNoNo must not exceed the lowest norm16 value that writeNorm16()
    // can produce for an algorithmic delta (center minus MAX_DELTA).
    int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
    if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
        fprintf(stderr,
                "gennorm2 error: "
                "data structure overflow, too much mapping composition data\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    // Now that the thresholds are known, compute and store the norm16 values.
    utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());

    setHangulData();

    // Look for the "worst" norm16 value of any supplementary code point
    // corresponding to a lead surrogate, and set it as that surrogate's value.
    // Enables quick check inner loops to look at only code units.
    //
    // We could be more sophisticated:
    // We could collect a bit set for whether there are values in the different
    // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
    // and select the best value that only breaks the composition and/or decomposition
    // inner loops if necessary.
    // However, that seems like overkill for an optimization for supplementary characters.
    for(UChar lead=0xd800; lead<0xdc00; ++lead) {
        uint32_t maxValue=utrie2_get32(norm16Trie, lead);
        utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
        if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
            maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
        ) {
            // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
            // Otherwise it might end up at something like JAMO_VT which stays in
            // the inner decomposition quick check loop.
            maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
        }
        utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
    }

    // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
    // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
    // which is harmless.
    // As a result, the minimum code points are always BMP code points.
    int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
    if(minCP>=0x10000) {
        indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
    }
    minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
    if(minCP>=0x10000) {
        indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
    }

    utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
    // Preflight with a NULL buffer to get the serialized length.
    // U_BUFFER_OVERFLOW_ERROR is the expected outcome; anything else is treated as an error.
    norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
    if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
                errorCode.errorName());
        exit(errorCode.reset());
    }
    errorCode.reset();

    // Byte offsets of the sections that follow the indexes:
    // trie, extra data (2 bytes per unit), small-FCD data.
    int32_t offset=(int32_t)sizeof(indexes);
    indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
    offset+=norm16TrieLength;
    indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
    offset+=extraData.length()*2;
    indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
    offset+=sizeof(smallFCD);
    int32_t totalSize=offset;
    // The remaining offset slots up to and including IX_TOTAL_SIZE all get the total size.
    for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
        indexes[i]=totalSize;
    }

    if(beVerbose) {
        printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
        printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
        printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
        printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
        printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
        printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
        printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
        printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
        printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
        printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
        printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
    }

    // If unicodeVersion is still all zeros, fall back to U_UNICODE_VERSION.
    UVersionInfo nullVersion={ 0, 0, 0, 0 };
    if(0==memcmp(nullVersion, unicodeVersion, 4)) {
        u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
    }
    memcpy(dataInfo.dataVersion, unicodeVersion, 4);
}
1193 
writeBinaryFile(const char * filename)1194 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
1195     processData();
1196 
1197     IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
1198     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1199     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1200     errorCode.assertSuccess();
1201 
1202     UNewDataMemory *pData=
1203         udata_create(NULL, NULL, filename, &dataInfo,
1204                      haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
1205     if(errorCode.isFailure()) {
1206         fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
1207                 filename, errorCode.errorName());
1208         exit(errorCode.reset());
1209     }
1210     udata_writeBlock(pData, indexes, sizeof(indexes));
1211     udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
1212     udata_writeUString(pData, extraData.getBuffer(), extraData.length());
1213     udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
1214     int32_t writtenSize=udata_finish(pData, errorCode);
1215     if(errorCode.isFailure()) {
1216         fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
1217         exit(errorCode.reset());
1218     }
1219     int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1220     if(writtenSize!=totalSize) {
1221         fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
1222             (long)writtenSize, (long)totalSize);
1223         exit(U_INTERNAL_PROGRAM_ERROR);
1224     }
1225 }
1226 
1227 void
writeCSourceFile(const char * filename)1228 Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
1229     processData();
1230 
1231     IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
1232     const char *basename=findBasename(filename);
1233     CharString path(filename, (int32_t)(basename-filename), errorCode);
1234     CharString dataName(basename, errorCode);
1235     const char *extension=strrchr(basename, '.');
1236     if(extension!=NULL) {
1237         dataName.truncate((int32_t)(extension-basename));
1238     }
1239     errorCode.assertSuccess();
1240 
1241     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1242     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1243     errorCode.assertSuccess();
1244 
1245     FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp");
1246     if(f==NULL) {
1247         fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
1248                 filename);
1249         exit(U_FILE_ACCESS_ERROR);
1250         return;
1251     }
1252     fputs("#ifdef INCLUDED_FROM_NORMALIZER2_CPP\n\n", f);
1253     char line[100];
1254     sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data());
1255     usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
1256     sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data());
1257     usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
1258     sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n",
1259             dataName.data());
1260     usrc_writeArray(f,
1261         line,
1262         indexes, 32, Normalizer2Impl::IX_COUNT,
1263         "\n};\n\n");
1264     sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data());
1265     usrc_writeUTrie2Arrays(f,
1266         line, NULL,
1267         norm16Trie,
1268         "\n};\n\n");
1269     sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data());
1270     usrc_writeArray(f,
1271         line,
1272         extraData.getBuffer(), 16, extraData.length(),
1273         "\n};\n\n");
1274     sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data());
1275     usrc_writeArray(f,
1276         line,
1277         smallFCD, 8, sizeof(smallFCD),
1278         "\n};\n\n");
1279     /*fputs(  // TODO
1280         "static const UCaseProps %s_singleton={\n"
1281         "  NULL,\n"
1282         "  %s_indexes,\n"
1283         "  %s_extraData,\n"
1284         "  %s_smallFCD,\n",
1285         f);*/
1286     sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
1287     char line2[100];
1288     sprintf(line2, "%s_trieIndex", dataName.data());
1289     usrc_writeUTrie2Struct(f,
1290         line,
1291         norm16Trie, line2, NULL,
1292         "};\n");
1293     fputs("\n#endif  // INCLUDED_FROM_NORMALIZER2_CPP\n", f);
1294     fclose(f);
1295 }
1296 
1297 U_NAMESPACE_END
1298 
1299 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1300 
1301 /*
1302  * Hey, Emacs, please set the following:
1303  *
1304  * Local Variables:
1305  * indent-tabs-mode: nil
1306  * End:
1307  */
1308