1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2009-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  n2builder.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2009nov25
14 *   created by: Markus W. Scherer
15 *
16 * Builds Normalizer2 data and writes a binary .nrm file.
17 * For the file format see source/common/normalizer2impl.h.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "n2builder.h"
22 
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #if U_HAVE_STD_STRING
27 #include <vector>
28 #endif
29 #include "unicode/errorcode.h"
30 #include "unicode/localpointer.h"
31 #include "unicode/putil.h"
32 #include "unicode/udata.h"
33 #include "unicode/uniset.h"
34 #include "unicode/unistr.h"
35 #include "unicode/ustring.h"
36 #include "charstr.h"
37 #include "hash.h"
38 #include "normalizer2impl.h"
39 #include "toolutil.h"
40 #include "unewdata.h"
41 #include "utrie2.h"
42 #include "uvectr32.h"
43 #include "writesrc.h"
44 
45 #if !UCONFIG_NO_NORMALIZATION
46 
/*
 * UDataInfo cf. udata.h
 *
 * Header description for the binary Normalizer2 data produced by this tool.
 * The positional initializers below follow the field order declared in udata.h.
 * Endianness, charset family and UChar size are taken from the build platform macros.
 */
static UDataInfo dataInfo={
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,

    { 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
    { 2, 0, 0, 0 },             /* formatVersion */
    { 5, 2, 0, 0 }              /* dataVersion (Unicode version) */
};
61 
62 U_NAMESPACE_BEGIN
63 
64 class HangulIterator {
65 public:
66     struct Range {
67         UChar32 start, limit;
68         uint16_t norm16;
69     };
70 
HangulIterator()71     HangulIterator() : rangeIndex(0) {}
nextRange()72     const Range *nextRange() {
73         if(rangeIndex<UPRV_LENGTHOF(ranges)) {
74             return ranges+rangeIndex++;
75         } else {
76             return NULL;
77         }
78     }
reset()79     void reset() { rangeIndex=0; }
80 private:
81     static const Range ranges[4];
82     int32_t rangeIndex;
83 };
84 
// Table iterated by HangulIterator::nextRange(); each entry is { start, limit, norm16 }.
// limit is computed as base+count, so it is one past the last code point of the range.
// Entries in order: Jamo L, Jamo V, Jamo T, Hangul syllables.
const HangulIterator::Range HangulIterator::ranges[4]={
    { Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
    { Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
    // JAMO_T_BASE+1: not U+11A7
    { Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
    { Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 },  // will become minYesNo
};
92 
93 struct CompositionPair {
CompositionPairCompositionPair94     CompositionPair(UChar32 t, UChar32 c) : trail(t), composite(c) {}
95     UChar32 trail, composite;
96 };
97 
98 struct Norm {
99     enum MappingType { NONE, REMOVED, ROUND_TRIP, ONE_WAY };
100 
hasMappingNorm101     UBool hasMapping() const { return mappingType>REMOVED; }
102 
103     // Requires hasMapping() and well-formed mapping.
setMappingCPNorm104     void setMappingCP() {
105         UChar32 c;
106         if(!mapping->isEmpty() && mapping->length()==U16_LENGTH(c=mapping->char32At(0))) {
107             mappingCP=c;
108         } else {
109             mappingCP=U_SENTINEL;
110         }
111     }
112 
getCompositionPairsNorm113     const CompositionPair *getCompositionPairs(int32_t &length) const {
114         if(compositions==NULL) {
115             length=0;
116             return NULL;
117         } else {
118             length=compositions->size()/2;
119             return reinterpret_cast<const CompositionPair *>(compositions->getBuffer());
120         }
121     }
122 
123     UnicodeString *mapping;
124     UnicodeString *rawMapping;  // non-NULL if the mapping is further decomposed
125     UChar32 mappingCP;  // >=0 if mapping to 1 code point
126     int32_t mappingPhase;
127     MappingType mappingType;
128 
129     UVector32 *compositions;  // (trail, composite) pairs
130     uint8_t cc;
131     UBool combinesBack;
132     UBool hasNoCompBoundaryAfter;
133 
134     enum OffsetType {
135         OFFSET_NONE,
136         // Composition for back-combining character. Allowed, but not normally used.
137         OFFSET_MAYBE_YES,
138         // Composition for a starter that does not have a decomposition mapping.
139         OFFSET_YES_YES,
140         // Round-trip mapping & composition for a starter.
141         OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
142         // Round-trip mapping for a starter that itself does not combine-forward.
143         OFFSET_YES_NO_MAPPING_ONLY,
144         // One-way mapping.
145         OFFSET_NO_NO,
146         // Delta for an algorithmic one-way mapping.
147         OFFSET_DELTA
148     };
149     enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
150     int32_t offset;
151 };
152 
153 class Normalizer2DBEnumerator {
154 public:
Normalizer2DBEnumerator(Normalizer2DataBuilder & b)155     Normalizer2DBEnumerator(Normalizer2DataBuilder &b) : builder(b) {}
~Normalizer2DBEnumerator()156     virtual ~Normalizer2DBEnumerator() {}
157     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) = 0;
ptr()158     Normalizer2DBEnumerator *ptr() { return this; }
159 protected:
160     Normalizer2DataBuilder &builder;
161 };
162 
163 U_CDECL_BEGIN
164 
165 static UBool U_CALLCONV
enumRangeHandler(const void * context,UChar32 start,UChar32 end,uint32_t value)166 enumRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
167     return ((Normalizer2DBEnumerator *)context)->rangeHandler(start, end, value);
168 }
169 
170 U_CDECL_END
171 
Normalizer2DataBuilder(UErrorCode & errorCode)172 Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
173         phase(0), overrideHandling(OVERRIDE_PREVIOUS), optimization(OPTIMIZE_NORMAL),
174         norm16TrieLength(0) {
175     memset(unicodeVersion, 0, sizeof(unicodeVersion));
176     normTrie=utrie2_open(0, 0, &errorCode);
177     normMem=utm_open("gennorm2 normalization structs", 10000, 0x110100, sizeof(Norm));
178     norms=allocNorm();  // unused Norm struct at index 0
179     memset(indexes, 0, sizeof(indexes));
180     memset(smallFCD, 0, sizeof(smallFCD));
181 }
182 
~Normalizer2DataBuilder()183 Normalizer2DataBuilder::~Normalizer2DataBuilder() {
184     utrie2_close(normTrie);
185     int32_t normsLength=utm_countItems(normMem);
186     for(int32_t i=1; i<normsLength; ++i) {
187         delete norms[i].mapping;
188         delete norms[i].rawMapping;
189         delete norms[i].compositions;
190     }
191     utm_close(normMem);
192     utrie2_close(norm16Trie);
193 }
194 
195 void
setUnicodeVersion(const char * v)196 Normalizer2DataBuilder::setUnicodeVersion(const char *v) {
197     UVersionInfo nullVersion={ 0, 0, 0, 0 };
198     UVersionInfo version;
199     u_versionFromString(version, v);
200     if( 0!=memcmp(version, unicodeVersion, U_MAX_VERSION_LENGTH) &&
201         0!=memcmp(nullVersion, unicodeVersion, U_MAX_VERSION_LENGTH)
202     ) {
203         char buffer[U_MAX_VERSION_STRING_LENGTH];
204         u_versionToString(unicodeVersion, buffer);
205         fprintf(stderr, "gennorm2 error: multiple inconsistent Unicode version numbers %s vs. %s\n",
206                 buffer, v);
207         exit(U_ILLEGAL_ARGUMENT_ERROR);
208     }
209     memcpy(unicodeVersion, version, U_MAX_VERSION_LENGTH);
210 }
211 
allocNorm()212 Norm *Normalizer2DataBuilder::allocNorm() {
213     Norm *p=(Norm *)utm_alloc(normMem);
214     norms=(Norm *)utm_getStart(normMem);  // in case it got reallocated
215     return p;
216 }
217 
218 /* get an existing Norm unit */
getNorm(UChar32 c)219 Norm *Normalizer2DataBuilder::getNorm(UChar32 c) {
220     uint32_t i=utrie2_get32(normTrie, c);
221     if(i==0) {
222         return NULL;
223     }
224     return norms+i;
225 }
226 
getNormRef(UChar32 c) const227 const Norm &Normalizer2DataBuilder::getNormRef(UChar32 c) const {
228     return norms[utrie2_get32(normTrie, c)];
229 }
230 
231 /*
232  * get or create a Norm unit;
233  * get or create the intermediate trie entries for it as well
234  */
createNorm(UChar32 c)235 Norm *Normalizer2DataBuilder::createNorm(UChar32 c) {
236     uint32_t i=utrie2_get32(normTrie, c);
237     if(i!=0) {
238         return norms+i;
239     } else {
240         /* allocate Norm */
241         Norm *p=allocNorm();
242         IcuToolErrorCode errorCode("gennorm2/createNorm()");
243         utrie2_set32(normTrie, c, (uint32_t)(p-norms), errorCode);
244         return p;
245     }
246 }
247 
checkNormForMapping(Norm * p,UChar32 c)248 Norm *Normalizer2DataBuilder::checkNormForMapping(Norm *p, UChar32 c) {
249     if(p!=NULL) {
250         if(p->mappingType!=Norm::NONE) {
251             if( overrideHandling==OVERRIDE_NONE ||
252                 (overrideHandling==OVERRIDE_PREVIOUS && p->mappingPhase==phase)
253             ) {
254                 fprintf(stderr,
255                         "error in gennorm2 phase %d: "
256                         "not permitted to override mapping for U+%04lX from phase %d\n",
257                         (int)phase, (long)c, (int)p->mappingPhase);
258                 exit(U_INVALID_FORMAT_ERROR);
259             }
260             delete p->mapping;
261             p->mapping=NULL;
262         }
263         p->mappingPhase=phase;
264     }
265     return p;
266 }
267 
setOverrideHandling(OverrideHandling oh)268 void Normalizer2DataBuilder::setOverrideHandling(OverrideHandling oh) {
269     overrideHandling=oh;
270     ++phase;
271 }
272 
setCC(UChar32 c,uint8_t cc)273 void Normalizer2DataBuilder::setCC(UChar32 c, uint8_t cc) {
274     createNorm(c)->cc=cc;
275 }
276 
getCC(UChar32 c) const277 uint8_t Normalizer2DataBuilder::getCC(UChar32 c) const {
278     return getNormRef(c).cc;
279 }
280 
isWellFormed(const UnicodeString & s)281 static UBool isWellFormed(const UnicodeString &s) {
282     UErrorCode errorCode=U_ZERO_ERROR;
283     u_strToUTF8(NULL, 0, NULL, s.getBuffer(), s.length(), &errorCode);
284     return U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR;
285 }
286 
setOneWayMapping(UChar32 c,const UnicodeString & m)287 void Normalizer2DataBuilder::setOneWayMapping(UChar32 c, const UnicodeString &m) {
288     if(!isWellFormed(m)) {
289         fprintf(stderr,
290                 "error in gennorm2 phase %d: "
291                 "illegal one-way mapping from U+%04lX to malformed string\n",
292                 (int)phase, (long)c);
293         exit(U_INVALID_FORMAT_ERROR);
294     }
295     Norm *p=checkNormForMapping(createNorm(c), c);
296     p->mapping=new UnicodeString(m);
297     p->mappingType=Norm::ONE_WAY;
298     p->setMappingCP();
299 }
300 
setRoundTripMapping(UChar32 c,const UnicodeString & m)301 void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString &m) {
302     if(U_IS_SURROGATE(c)) {
303         fprintf(stderr,
304                 "error in gennorm2 phase %d: "
305                 "illegal round-trip mapping from surrogate code point U+%04lX\n",
306                 (int)phase, (long)c);
307         exit(U_INVALID_FORMAT_ERROR);
308     }
309     if(!isWellFormed(m)) {
310         fprintf(stderr,
311                 "error in gennorm2 phase %d: "
312                 "illegal round-trip mapping from U+%04lX to malformed string\n",
313                 (int)phase, (long)c);
314         exit(U_INVALID_FORMAT_ERROR);
315     }
316     int32_t numCP=u_countChar32(m.getBuffer(), m.length());
317     if(numCP!=2) {
318         fprintf(stderr,
319                 "error in gennorm2 phase %d: "
320                 "illegal round-trip mapping from U+%04lX to %d!=2 code points\n",
321                 (int)phase, (long)c, (int)numCP);
322         exit(U_INVALID_FORMAT_ERROR);
323     }
324     Norm *p=checkNormForMapping(createNorm(c), c);
325     p->mapping=new UnicodeString(m);
326     p->mappingType=Norm::ROUND_TRIP;
327     p->mappingCP=U_SENTINEL;
328 }
329 
removeMapping(UChar32 c)330 void Normalizer2DataBuilder::removeMapping(UChar32 c) {
331     Norm *p=checkNormForMapping(getNorm(c), c);
332     if(p!=NULL) {
333         p->mappingType=Norm::REMOVED;
334     }
335 }
336 
337 class CompositionBuilder : public Normalizer2DBEnumerator {
338 public:
CompositionBuilder(Normalizer2DataBuilder & b)339     CompositionBuilder(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
rangeHandler(UChar32 start,UChar32 end,uint32_t value)340     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
341         builder.addComposition(start, end, value);
342         return TRUE;
343     }
344 };
345 
346 void
addComposition(UChar32 start,UChar32 end,uint32_t value)347 Normalizer2DataBuilder::addComposition(UChar32 start, UChar32 end, uint32_t value) {
348     if(norms[value].mappingType==Norm::ROUND_TRIP) {
349         if(start!=end) {
350             fprintf(stderr,
351                     "gennorm2 error: same round-trip mapping for "
352                     "more than 1 code point U+%04lX..U+%04lX\n",
353                     (long)start, (long)end);
354             exit(U_INVALID_FORMAT_ERROR);
355         }
356         if(norms[value].cc!=0) {
357             fprintf(stderr,
358                     "gennorm2 error: "
359                     "U+%04lX has a round-trip mapping and ccc!=0, "
360                     "not possible in Unicode normalization\n",
361                     (long)start);
362             exit(U_INVALID_FORMAT_ERROR);
363         }
364         // setRoundTripMapping() ensured that there are exactly two code points.
365         const UnicodeString &m=*norms[value].mapping;
366         UChar32 lead=m.char32At(0);
367         UChar32 trail=m.char32At(m.length()-1);
368         if(getCC(lead)!=0) {
369             fprintf(stderr,
370                     "gennorm2 error: "
371                     "U+%04lX's round-trip mapping's starter U+%04lX has ccc!=0, "
372                     "not possible in Unicode normalization\n",
373                     (long)start, (long)lead);
374             exit(U_INVALID_FORMAT_ERROR);
375         }
376         // Flag for trailing character.
377         createNorm(trail)->combinesBack=TRUE;
378         // Insert (trail, composite) pair into compositions list for the lead character.
379         IcuToolErrorCode errorCode("gennorm2/addComposition()");
380         Norm *leadNorm=createNorm(lead);
381         UVector32 *compositions=leadNorm->compositions;
382         int32_t i;
383         if(compositions==NULL) {
384             compositions=leadNorm->compositions=new UVector32(errorCode);
385             i=0;  // "insert" the first pair at index 0
386         } else {
387             // Insertion sort, and check for duplicate trail characters.
388             int32_t length;
389             const CompositionPair *pairs=leadNorm->getCompositionPairs(length);
390             for(i=0; i<length; ++i) {
391                 if(trail==pairs[i].trail) {
392                     fprintf(stderr,
393                             "gennorm2 error: same round-trip mapping for "
394                             "more than 1 code point (e.g., U+%04lX) to U+%04lX + U+%04lX\n",
395                             (long)start, (long)lead, (long)trail);
396                     exit(U_INVALID_FORMAT_ERROR);
397                 }
398                 if(trail<pairs[i].trail) {
399                     break;
400                 }
401             }
402         }
403         compositions->insertElementAt(trail, 2*i, errorCode);
404         compositions->insertElementAt(start, 2*i+1, errorCode);
405     }
406 }
407 
combinesWithCCBetween(const Norm & norm,uint8_t lowCC,uint8_t highCC) const408 UBool Normalizer2DataBuilder::combinesWithCCBetween(const Norm &norm,
409                                                     uint8_t lowCC, uint8_t highCC) const {
410     if((highCC-lowCC)>=2) {
411         int32_t length;
412         const CompositionPair *pairs=norm.getCompositionPairs(length);
413         for(int32_t i=0; i<length; ++i) {
414             uint8_t trailCC=getCC(pairs[i].trail);
415             if(lowCC<trailCC && trailCC<highCC) {
416                 return TRUE;
417             }
418         }
419     }
420     return FALSE;
421 }
422 
combine(const Norm & norm,UChar32 trail) const423 UChar32 Normalizer2DataBuilder::combine(const Norm &norm, UChar32 trail) const {
424     int32_t length;
425     const CompositionPair *pairs=norm.getCompositionPairs(length);
426     for(int32_t i=0; i<length; ++i) {
427         if(trail==pairs[i].trail) {
428             return pairs[i].composite;
429         }
430         if(trail<pairs[i].trail) {
431             break;
432         }
433     }
434     return U_SENTINEL;
435 }
436 
437 class Decomposer : public Normalizer2DBEnumerator {
438 public:
Decomposer(Normalizer2DataBuilder & b)439     Decomposer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b), didDecompose(FALSE) {}
rangeHandler(UChar32 start,UChar32 end,uint32_t value)440     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
441         didDecompose|=builder.decompose(start, end, value);
442         return TRUE;
443     }
444     UBool didDecompose;
445 };
446 
// Performs one decomposition step for the mapping of the Norm at index value,
// which applies to the code points start..end.
// Each code point in the mapping that itself has a mapping is replaced with that mapping,
// and each Hangul syllable is replaced with the output of Hangul::decompose().
// Exits with an error if the mapping contains a code point from start..end,
// or if a round-trip mapping violates the constraints checked below.
// Returns TRUE if the mapping was replaced, FALSE if nothing changed
// (including when the Norm has no mapping).
UBool
Normalizer2DataBuilder::decompose(UChar32 start, UChar32 end, uint32_t value) {
    if(norms[value].hasMapping()) {
        Norm &norm=norms[value];
        const UnicodeString &m=*norm.mapping;
        // Allocated lazily, when the first replaceable code point is found.
        UnicodeString *decomposed=NULL;
        const UChar *s=m.getBuffer();
        int32_t length=m.length();
        // prev=start index of the current code point c, i=index after c
        int32_t prev, i=0;
        UChar32 c;
        while(i<length) {
            prev=i;
            U16_NEXT(s, i, length, c);
            if(start<=c && c<=end) {
                fprintf(stderr,
                        "gennorm2 error: U+%04lX maps to itself directly or indirectly\n",
                        (long)c);
                exit(U_INVALID_FORMAT_ERROR);
            }
            const Norm &cNorm=getNormRef(c);
            if(cNorm.hasMapping()) {
                if(norm.mappingType==Norm::ROUND_TRIP) {
                    if(prev==0) {
                        // c is the first code point of the round-trip mapping.
                        if(cNorm.mappingType!=Norm::ROUND_TRIP) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX one-way-decomposes, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                        // Compare the cc of the code point following c in this mapping
                        // with the cc of the last code point of c's own mapping.
                        uint8_t myTrailCC=getCC(m.char32At(i));
                        UChar32 cTrailChar=cNorm.mapping->char32At(cNorm.mapping->length()-1);
                        uint8_t cTrailCC=getCC(cTrailChar);
                        if(cTrailCC>myTrailCC) {
                            fprintf(stderr,
                                    "gennorm2 error: "
                                    "U+%04lX's round-trip mapping's starter "
                                    "U+%04lX decomposes and the "
                                    "inner/earlier tccc=%hu > outer/following tccc=%hu, "
                                    "not possible in Unicode normalization\n",
                                    (long)start, (long)c,
                                    (short)cTrailCC, (short)myTrailCC);
                            exit(U_INVALID_FORMAT_ERROR);
                        }
                    } else {
                        // c is not the first code point of the round-trip mapping.
                        fprintf(stderr,
                                "gennorm2 error: "
                                "U+%04lX's round-trip mapping's non-starter "
                                "U+%04lX decomposes, "
                                "not possible in Unicode normalization\n",
                                (long)start, (long)c);
                        exit(U_INVALID_FORMAT_ERROR);
                    }
                }
                if(decomposed==NULL) {
                    // Start with the unchanged part of the mapping before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(*cNorm.mapping);
            } else if(Hangul::isHangul(c)) {
                UChar buffer[3];
                int32_t hangulLength=Hangul::decompose(c, buffer);
                if(norm.mappingType==Norm::ROUND_TRIP && prev!=0) {
                    fprintf(stderr,
                            "gennorm2 error: "
                            "U+%04lX's round-trip mapping's non-starter "
                            "U+%04lX decomposes, "
                            "not possible in Unicode normalization\n",
                            (long)start, (long)c);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                if(decomposed==NULL) {
                    // Start with the unchanged part of the mapping before c.
                    decomposed=new UnicodeString(m, 0, prev);
                }
                decomposed->append(buffer, hangulLength);
            } else if(decomposed!=NULL) {
                // c stays as is; copy it only if a new string has been started.
                decomposed->append(m, prev, i-prev);
            }
        }
        if(decomposed!=NULL) {
            if(norm.rawMapping==NULL) {
                // Remember the original mapping when decomposing recursively.
                norm.rawMapping=norm.mapping;
            } else {
                // The original mapping is already kept in rawMapping;
                // the current mapping is an intermediate result.
                delete norm.mapping;
            }
            norm.mapping=decomposed;
            // Not  norm.setMappingCP();  because the original mapping
            // is most likely to be encodable as a delta.
            return TRUE;
        }
    }
    return FALSE;
}
542 
543 class BuilderReorderingBuffer {
544 public:
BuilderReorderingBuffer()545     BuilderReorderingBuffer() : fLength(0), fLastStarterIndex(-1), fDidReorder(FALSE) {}
reset()546     void reset() {
547         fLength=0;
548         fLastStarterIndex=-1;
549         fDidReorder=FALSE;
550     }
length() const551     int32_t length() const { return fLength; }
isEmpty() const552     UBool isEmpty() const { return fLength==0; }
lastStarterIndex() const553     int32_t lastStarterIndex() const { return fLastStarterIndex; }
charAt(int32_t i) const554     UChar32 charAt(int32_t i) const { return fArray[i]>>8; }
ccAt(int32_t i) const555     uint8_t ccAt(int32_t i) const { return (uint8_t)fArray[i]; }
didReorder() const556     UBool didReorder() const { return fDidReorder; }
append(UChar32 c,uint8_t cc)557     void append(UChar32 c, uint8_t cc) {
558         if(cc==0 || fLength==0 || ccAt(fLength-1)<=cc) {
559             if(cc==0) {
560                 fLastStarterIndex=fLength;
561             }
562             fArray[fLength++]=(c<<8)|cc;
563             return;
564         }
565         // Let this character bubble back to its canonical order.
566         int32_t i=fLength-1;
567         while(i>fLastStarterIndex && ccAt(i)>cc) {
568             --i;
569         }
570         ++i;  // after the last starter or prevCC<=cc
571         // Move this and the following characters forward one to make space.
572         for(int32_t j=fLength; i<j; --j) {
573             fArray[j]=fArray[j-1];
574         }
575         fArray[i]=(c<<8)|cc;
576         ++fLength;
577         fDidReorder=TRUE;
578     }
toString(UnicodeString & dest)579     void toString(UnicodeString &dest) {
580         dest.remove();
581         for(int32_t i=0; i<fLength; ++i) {
582             dest.append(charAt(i));
583         }
584     }
setComposite(UChar32 composite,int32_t combMarkIndex)585     void setComposite(UChar32 composite, int32_t combMarkIndex) {
586         fArray[fLastStarterIndex]=composite<<8;
587         // Remove the combining mark that contributed to the composite.
588         --fLength;
589         while(combMarkIndex<fLength) {
590             fArray[combMarkIndex]=fArray[combMarkIndex+1];
591             ++combMarkIndex;
592         }
593     }
594 private:
595     int32_t fArray[Normalizer2Impl::MAPPING_LENGTH_MASK];
596     int32_t fLength;
597     int32_t fLastStarterIndex;
598     UBool fDidReorder;
599 };
600 
601 void
reorder(Norm * p,BuilderReorderingBuffer & buffer)602 Normalizer2DataBuilder::reorder(Norm *p, BuilderReorderingBuffer &buffer) {
603     UnicodeString &m=*p->mapping;
604     int32_t length=m.length();
605     if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
606         return;  // writeMapping() will complain about it and print the code point.
607     }
608     const UChar *s=m.getBuffer();
609     int32_t i=0;
610     UChar32 c;
611     while(i<length) {
612         U16_NEXT(s, i, length, c);
613         buffer.append(c, getCC(c));
614     }
615     if(buffer.didReorder()) {
616         buffer.toString(m);
617     }
618 }
619 
/*
 * Computes the flag for the last code branch in Normalizer2Impl::hasCompBoundaryAfter().
 * A starter character with a mapping does not have a composition boundary after it
 * if the character itself combines-forward (which is tested by the caller of this function),
 * or it is deleted (mapped to the empty string),
 * or its mapping contains no starter,
 * or the last starter combines-forward.
 *
 * The buffer holds the code points of the mapping with their cc values.
 * Note: This function may modify the buffer via setComposite() while it composes.
 */
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer) {
    if(buffer.isEmpty()) {
        return TRUE;  // maps-to-empty-string is no boundary of any kind
    }
    int32_t lastStarterIndex=buffer.lastStarterIndex();
    if(lastStarterIndex<0) {
        return TRUE;  // no starter
    }
    UChar32 starter=buffer.charAt(lastStarterIndex);
    if( Hangul::isJamoL(starter) ||
        (Hangul::isJamoV(starter) &&
         0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))
    ) {
        // A Jamo leading consonant or an LV pair combines-forward if it is at the end,
        // otherwise it is blocked.
        return lastStarterIndex==buffer.length()-1;
    }
    // Note: There can be no Hangul syllable in the fully decomposed mapping.
    const Norm *starterNorm=&getNormRef(starter);
    if(starterNorm->compositions==NULL) {
        return FALSE;  // the last starter does not combine forward
    }
    // Compose as far as possible, and see if further compositions are possible.
    // prevCC is the cc of the most recent mark that was not composed; 0 if there is none yet.
    uint8_t prevCC=0;
    for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length();) {
        uint8_t cc=buffer.ccAt(combMarkIndex);  // !=0 because after last starter
        // Could a mark with a cc between prevCC and cc combine with the starter?
        if(combinesWithCCBetween(*starterNorm, prevCC, cc)) {
            return TRUE;
        }
        if( prevCC<cc &&
            (starter=combine(*starterNorm, buffer.charAt(combMarkIndex)))>=0
        ) {
            // The mark combines with the starter: replace the starter with the composite
            // and remove the mark. combMarkIndex then refers to the following entry,
            // so it is not incremented here.
            buffer.setComposite(starter, combMarkIndex);
            starterNorm=&getNormRef(starter);
            if(starterNorm->compositions==NULL) {
                return FALSE;  // the composite does not combine further
            }
        } else {
            // The mark stays in the buffer.
            prevCC=cc;
            ++combMarkIndex;
        }
    }
    // TRUE if the final, forward-combining starter is at the end.
    return prevCC==0;
}
673 
// Requires p->hasMapping().
// Returns the offset of the "first unit" from the beginning of the extraData for c.
// That is the same as the length of the optional data for the raw mapping and the ccc/lccc word.
//
// Appends to dataString, in this order:
// the optional raw mapping data, the optional ccc/lccc word,
// the first unit (mapping length, trail cc and flags), and the mapping itself.
// Also sets the small-FCD bit for c if the mapping's first or last code point has cc!=0.
// Exits with an error if the mapping or the raw mapping is too long,
// or if c is below Normalizer2Impl::MIN_CCC_LCCC_CP and has ccc!=0 or lccc!=0.
int32_t Normalizer2DataBuilder::writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString) {
    UnicodeString &m=*p->mapping;
    int32_t length=m.length();
    if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
        fprintf(stderr,
                "gennorm2 error: "
                "mapping for U+%04lX longer than maximum of %d\n",
                (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // cc values of the first and last code points of the mapping; 0 for an empty mapping
    int32_t leadCC, trailCC;
    if(length==0) {
        leadCC=trailCC=0;
    } else {
        leadCC=getCC(m.char32At(0));
        trailCC=getCC(m.char32At(length-1));
    }
    if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && (p->cc!=0 || leadCC!=0)) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Write small-FCD data.
    if((leadCC|trailCC)!=0) {
        // For a supplementary code point use its lead surrogate.
        UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
        // One bit per 32 code units: byte index lead>>8, bit index (lead>>5)&7.
        smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
    }
    // Write the mapping & raw mapping extraData.
    int32_t firstUnit=length|(trailCC<<8);
    int32_t preMappingLength=0;
    if(p->rawMapping!=NULL) {
        UnicodeString &rm=*p->rawMapping;
        int32_t rmLength=rm.length();
        if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) {
            fprintf(stderr,
                    "gennorm2 error: "
                    "raw mapping for U+%04lX longer than maximum of %d\n",
                    (long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
            exit(U_INVALID_FORMAT_ERROR);
        }
        UChar rm0=rm.charAt(0);
        if( rmLength==length-1 &&
            // 99: overlong substring lengths get pinned to remainder lengths anyway
            0==rm.compare(1, 99, m, 2, 99) &&
            rm0>Normalizer2Impl::MAPPING_LENGTH_MASK
        ) {
            // Compression:
            // rawMapping=rm0+mapping.substring(2) -> store only rm0
            //
            // The raw mapping is the same as the final mapping after replacing
            // the final mapping's first two code units with the raw mapping's first one.
            // In this case, we store only that first unit, rm0.
            // This helps with a few hundred mappings.
            dataString.append(rm0);
            preMappingLength=1;
        } else {
            // Store the raw mapping with its length.
            dataString.append(rm);
            dataString.append((UChar)rmLength);
            preMappingLength=rmLength+1;
        }
        firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING;
    }
    // ccc of c in the low byte, lccc of the mapping in the high byte;
    // written only if at least one of them is not zero.
    int32_t cccLccc=p->cc|(leadCC<<8);
    if(cccLccc!=0) {
        dataString.append((UChar)cccLccc);
        ++preMappingLength;
        firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
    }
    if(p->hasNoCompBoundaryAfter) {
        firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
    }
    dataString.append((UChar)firstUnit);
    dataString.append(m);
    return preMappingLength;
}
755 
// Requires p->compositions!=NULL.
// Appends the encoded composition pairs of p to dataString,
// using two or three 16-bit units per pair.
// Exits with an error if p->cc!=0.
// c is only used for the error message.
void Normalizer2DataBuilder::writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString) {
    if(p->cc!=0) {
        fprintf(stderr,
                "gennorm2 error: "
                "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n",
                (long)c);
        exit(U_INVALID_FORMAT_ERROR);
    }
    int32_t length;
    const CompositionPair *pairs=p->getCompositionPairs(length);
    for(int32_t i=0; i<length; ++i) {
        const CompositionPair &pair=pairs[i];
        // 22 bits for the composite character and whether it combines forward.
        UChar32 compositeAndFwd=pair.composite<<1;
        if(getNormRef(pair.composite).compositions!=NULL) {
            compositeAndFwd|=1;  // The composite character also combines-forward.
        }
        // Encode most pairs in two units and some in three.
        // thirdUnit<0 means that there is no third unit.
        int32_t firstUnit, secondUnit, thirdUnit;
        if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) {
            if(compositeAndFwd<=0xffff) {
                // Two units: the trail character, then compositeAndFwd.
                firstUnit=pair.trail<<1;
                secondUnit=compositeAndFwd;
                thirdUnit=-1;
            } else {
                // Three units: compositeAndFwd is split into its upper and lower 16 bits.
                firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE;
                secondUnit=compositeAndFwd>>16;
                thirdUnit=compositeAndFwd;
            }
        } else {
            // Three units: the trail character is split between the first and second units.
            firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+
                       (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))|
                      Normalizer2Impl::COMP_1_TRIPLE;
            secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)|
                       (compositeAndFwd>>16);
            thirdUnit=compositeAndFwd;
        }
        // Set the high bit of the first unit if this is the last composition pair.
        if(i==(length-1)) {
            firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE;
        }
        // The (UChar) casts keep only the low 16 bits of each unit.
        dataString.append((UChar)firstUnit).append((UChar)secondUnit);
        if(thirdUnit>=0) {
            dataString.append((UChar)thirdUnit);
        }
    }
}
804 
805 class ExtraDataWriter : public Normalizer2DBEnumerator {
806 public:
ExtraDataWriter(Normalizer2DataBuilder & b)807     ExtraDataWriter(Normalizer2DataBuilder &b) :
808         Normalizer2DBEnumerator(b),
809         yesYesCompositions(1000, (UChar32)0xffff, 2),  // 0=inert, 1=Jamo L, 2=start of compositions
810         yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {}  // 0=Hangul, 1=start of normal data
rangeHandler(UChar32 start,UChar32 end,uint32_t value)811     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
812         if(value!=0) {
813             if(start!=end) {
814                 fprintf(stderr,
815                         "gennorm2 error: unexpected shared data for "
816                         "multiple code points U+%04lX..U+%04lX\n",
817                         (long)start, (long)end);
818                 exit(U_INTERNAL_PROGRAM_ERROR);
819             }
820             builder.writeExtraData(start, value, *this);
821         }
822         return TRUE;
823     }
824     UnicodeString maybeYesCompositions;
825     UnicodeString yesYesCompositions;
826     UnicodeString yesNoMappingsAndCompositions;
827     UnicodeString yesNoMappingsOnly;
828     UnicodeString noNoMappings;
829     Hashtable previousNoNoMappings;  // If constructed in runtime code, pass in UErrorCode.
830 };
831 
writeExtraData(UChar32 c,uint32_t value,ExtraDataWriter & writer)832 void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer) {
833     Norm *p=norms+value;
834     if(!p->hasMapping()) {
835         // Write small-FCD data.
836         // There is similar code in writeMapping() for characters that do have a mapping.
837         if(c<Normalizer2Impl::MIN_CCC_LCCC_CP && p->cc!=0) {
838             fprintf(stderr,
839                     "gennorm2 error: "
840                     "U+%04lX below U+0300 has ccc!=0, not supported by ICU\n",
841                     (long)c);
842             exit(U_INVALID_FORMAT_ERROR);
843         }
844         if(p->cc!=0) {
845             UChar32 lead= c<=0xffff ? c : U16_LEAD(c);
846             smallFCD[lead>>8]|=(uint8_t)1<<((lead>>5)&7);
847         }
848     }
849     if(p->combinesBack) {
850         if(p->hasMapping()) {
851             fprintf(stderr,
852                     "gennorm2 error: "
853                     "U+%04lX combines-back and decomposes, not possible in Unicode normalization\n",
854                     (long)c);
855             exit(U_INVALID_FORMAT_ERROR);
856         }
857         if(p->compositions!=NULL) {
858             p->offset=
859                 (writer.maybeYesCompositions.length()<<Norm::OFFSET_SHIFT)|
860                 Norm::OFFSET_MAYBE_YES;
861             writeCompositions(c, p, writer.maybeYesCompositions);
862         }
863     } else if(!p->hasMapping()) {
864         if(p->compositions!=NULL) {
865             p->offset=
866                 (writer.yesYesCompositions.length()<<Norm::OFFSET_SHIFT)|
867                 Norm::OFFSET_YES_YES;
868             writeCompositions(c, p, writer.yesYesCompositions);
869         }
870     } else if(p->mappingType==Norm::ROUND_TRIP) {
871         if(p->compositions!=NULL) {
872             int32_t offset=writer.yesNoMappingsAndCompositions.length()+
873                            writeMapping(c, p, writer.yesNoMappingsAndCompositions);
874             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
875             writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
876         } else {
877             int32_t offset=writer.yesNoMappingsOnly.length()+
878                            writeMapping(c, p, writer.yesNoMappingsOnly);
879             p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
880         }
881     } else /* one-way */ {
882         if(p->compositions!=NULL) {
883             fprintf(stderr,
884                     "gennorm2 error: "
885                     "U+%04lX combines-forward and has a one-way mapping, "
886                     "not possible in Unicode normalization\n",
887                     (long)c);
888             exit(U_INVALID_FORMAT_ERROR);
889         }
890         if(p->cc==0 && optimization!=OPTIMIZE_FAST) {
891             // Try a compact, algorithmic encoding.
892             // Only for ccc=0, because we can't store additional information
893             // and we do not recursively follow an algorithmic encoding for access to the ccc.
894             //
895             // Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
896             // if the mappingCP decomposes further, to ensure that there is a place to store it.
897             // We want to see that the final mapping does not have exactly 1 code point,
898             // or else we would have to recursively ensure that the final mapping is stored
899             // in normal extraData.
900             if(p->mappingCP>=0 && (!p->hasNoCompBoundaryAfter || 1!=p->mapping->countChar32())) {
901                 int32_t delta=p->mappingCP-c;
902                 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
903                     p->offset=(delta<<Norm::OFFSET_SHIFT)|Norm::OFFSET_DELTA;
904                 }
905             }
906         }
907         if(p->offset==0) {
908             int32_t oldNoNoLength=writer.noNoMappings.length();
909             int32_t offset=oldNoNoLength+writeMapping(c, p, writer.noNoMappings);
910             UnicodeString newMapping=writer.noNoMappings.tempSubString(oldNoNoLength);
911             int32_t previousOffset=writer.previousNoNoMappings.geti(newMapping);
912             if(previousOffset!=0) {
913                 // Duplicate, remove the new units and point to the old ones.
914                 writer.noNoMappings.truncate(oldNoNoLength);
915                 p->offset=((previousOffset-1)<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
916             } else {
917                 // Enter this new mapping into the hashtable, avoiding value 0 which is "not found".
918                 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.puti()");
919                 writer.previousNoNoMappings.puti(newMapping, offset+1, errorCode);
920                 p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_NO_NO;
921             }
922         }
923     }
924 }
925 
926 class Norm16Writer : public Normalizer2DBEnumerator {
927 public:
Norm16Writer(Normalizer2DataBuilder & b)928     Norm16Writer(Normalizer2DataBuilder &b) : Normalizer2DBEnumerator(b) {}
rangeHandler(UChar32 start,UChar32 end,uint32_t value)929     virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
930         builder.writeNorm16(start, end, value);
931         return TRUE;
932     }
933 };
934 
writeNorm16(UChar32 start,UChar32 end,uint32_t value)935 void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t value) {
936     if(value!=0) {
937         const Norm *p=norms+value;
938         int32_t offset=p->offset>>Norm::OFFSET_SHIFT;
939         int32_t norm16=0;
940         UBool isDecompNo=FALSE;
941         UBool isCompNoMaybe=FALSE;
942         switch(p->offset&Norm::OFFSET_MASK) {
943         case Norm::OFFSET_NONE:
944             // No mapping, no compositions list.
945             if(p->combinesBack) {
946                 norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+p->cc;
947                 isDecompNo=(UBool)(p->cc!=0);
948                 isCompNoMaybe=TRUE;
949             } else if(p->cc!=0) {
950                 norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+p->cc;
951                 isDecompNo=isCompNoMaybe=TRUE;
952             }
953             break;
954         case Norm::OFFSET_MAYBE_YES:
955             norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+offset;
956             isCompNoMaybe=TRUE;
957             break;
958         case Norm::OFFSET_YES_YES:
959             norm16=offset;
960             break;
961         case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
962             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
963             isDecompNo=TRUE;
964             break;
965         case Norm::OFFSET_YES_NO_MAPPING_ONLY:
966             norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
967             isDecompNo=TRUE;
968             break;
969         case Norm::OFFSET_NO_NO:
970             norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
971             isDecompNo=isCompNoMaybe=TRUE;
972             break;
973         case Norm::OFFSET_DELTA:
974             norm16=getCenterNoNoDelta()+offset;
975             isDecompNo=isCompNoMaybe=TRUE;
976             break;
977         default:  // Should not occur.
978             exit(U_INTERNAL_PROGRAM_ERROR);
979         }
980         IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
981         utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
982         if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
983             indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
984         }
985         if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
986             indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
987         }
988     }
989 }
990 
setHangulData()991 void Normalizer2DataBuilder::setHangulData() {
992     HangulIterator hi;
993     const HangulIterator::Range *range;
994     // Check that none of the Hangul/Jamo code points have data.
995     while((range=hi.nextRange())!=NULL) {
996         for(UChar32 c=range->start; c<range->limit; ++c) {
997             if(utrie2_get32(norm16Trie, c)!=0) {
998                 fprintf(stderr,
999                         "gennorm2 error: "
1000                         "illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
1001                         (long)c);
1002                 exit(U_INVALID_FORMAT_ERROR);
1003             }
1004         }
1005     }
1006     // Set data for algorithmic runtime handling.
1007     IcuToolErrorCode errorCode("gennorm2/setHangulData()");
1008     hi.reset();
1009     while((range=hi.nextRange())!=NULL) {
1010         uint16_t norm16=range->norm16;
1011         if(norm16==0) {
1012             norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO];  // Hangul LV/LVT encoded as minYesNo
1013             if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
1014                 indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
1015             }
1016         } else {
1017             if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {  // Jamo V/T are maybeYes
1018                 indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
1019             }
1020         }
1021         utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
1022         errorCode.assertSuccess();
1023     }
1024 }
1025 
1026 U_CDECL_BEGIN
1027 
1028 static UBool U_CALLCONV
enumRangeMaxValue(const void * context,UChar32,UChar32,uint32_t value)1029 enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
1030     uint32_t *pMaxValue=(uint32_t *)context;
1031     if(value>*pMaxValue) {
1032         *pMaxValue=value;
1033     }
1034     return TRUE;
1035 }
1036 
1037 U_CDECL_END
1038 
processData()1039 void Normalizer2DataBuilder::processData() {
1040     IcuToolErrorCode errorCode("gennorm2/processData()");
1041     norm16Trie=utrie2_open(0, 0, errorCode);
1042     errorCode.assertSuccess();
1043 
1044     utrie2_enum(normTrie, NULL, enumRangeHandler, CompositionBuilder(*this).ptr());
1045 
1046     Decomposer decomposer(*this);
1047     do {
1048         decomposer.didDecompose=FALSE;
1049         utrie2_enum(normTrie, NULL, enumRangeHandler, &decomposer);
1050     } while(decomposer.didDecompose);
1051 
1052     BuilderReorderingBuffer buffer;
1053     int32_t normsLength=utm_countItems(normMem);
1054     for(int32_t i=1; i<normsLength; ++i) {
1055         // Set the hasNoCompBoundaryAfter flag for use by the last code branch
1056         // in Normalizer2Impl::hasCompBoundaryAfter().
1057         // For details see the comments on hasNoCompBoundaryAfter(buffer).
1058         const Norm &norm=norms[i];
1059         if(norm.hasMapping()) {
1060             if(norm.compositions!=NULL) {
1061                 norms[i].hasNoCompBoundaryAfter=TRUE;
1062             } else {
1063                 buffer.reset();
1064                 reorder(norms+i, buffer);
1065                 norms[i].hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
1066             }
1067         }
1068     }
1069 
1070     indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
1071     indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
1072 
1073     ExtraDataWriter extraDataWriter(*this);
1074     utrie2_enum(normTrie, NULL, enumRangeHandler, &extraDataWriter);
1075 
1076     extraData=extraDataWriter.maybeYesCompositions;
1077     extraData.append(extraDataWriter.yesYesCompositions).
1078               append(extraDataWriter.yesNoMappingsAndCompositions).
1079               append(extraDataWriter.yesNoMappingsOnly).
1080               append(extraDataWriter.noNoMappings);
1081     // Pad to even length for 4-byte alignment of following data.
1082     if(extraData.length()&1) {
1083         extraData.append((UChar)0);
1084     }
1085 
1086     indexes[Normalizer2Impl::IX_MIN_YES_NO]=
1087         extraDataWriter.yesYesCompositions.length();
1088     indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
1089         indexes[Normalizer2Impl::IX_MIN_YES_NO]+
1090         extraDataWriter.yesNoMappingsAndCompositions.length();
1091     indexes[Normalizer2Impl::IX_MIN_NO_NO]=
1092         indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
1093         extraDataWriter.yesNoMappingsOnly.length();
1094     indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
1095         indexes[Normalizer2Impl::IX_MIN_NO_NO]+
1096         extraDataWriter.noNoMappings.length();
1097     indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
1098         Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
1099         extraDataWriter.maybeYesCompositions.length();
1100 
1101     int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
1102     if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
1103         fprintf(stderr,
1104                 "gennorm2 error: "
1105                 "data structure overflow, too much mapping composition data\n");
1106         exit(U_BUFFER_OVERFLOW_ERROR);
1107     }
1108 
1109     utrie2_enum(normTrie, NULL, enumRangeHandler, Norm16Writer(*this).ptr());
1110 
1111     setHangulData();
1112 
1113     // Look for the "worst" norm16 value of any supplementary code point
1114     // corresponding to a lead surrogate, and set it as that surrogate's value.
1115     // Enables quick check inner loops to look at only code units.
1116     //
1117     // We could be more sophisticated:
1118     // We could collect a bit set for whether there are values in the different
1119     // norm16 ranges (yesNo, maybeYes, yesYesWithCC etc.)
1120     // and select the best value that only breaks the composition and/or decomposition
1121     // inner loops if necessary.
1122     // However, that seems like overkill for an optimization for supplementary characters.
1123     for(UChar lead=0xd800; lead<0xdc00; ++lead) {
1124         uint32_t maxValue=utrie2_get32(norm16Trie, lead);
1125         utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
1126         if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
1127             maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
1128         ) {
1129             // Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
1130             // Otherwise it might end up at something like JAMO_VT which stays in
1131             // the inner decomposition quick check loop.
1132             maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
1133         }
1134         utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
1135     }
1136 
1137     // Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
1138     // For an empty data file, minCP=0x110000 turns into 0xdc00 (first trail surrogate)
1139     // which is harmless.
1140     // As a result, the minimum code points are always BMP code points.
1141     int32_t minCP=indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP];
1142     if(minCP>=0x10000) {
1143         indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=U16_LEAD(minCP);
1144     }
1145     minCP=indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP];
1146     if(minCP>=0x10000) {
1147         indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
1148     }
1149 
1150     utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
1151     norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
1152     if(errorCode.get()!=U_BUFFER_OVERFLOW_ERROR) {
1153         fprintf(stderr, "gennorm2 error: unable to freeze/serialize the normalization trie - %s\n",
1154                 errorCode.errorName());
1155         exit(errorCode.reset());
1156     }
1157     errorCode.reset();
1158 
1159     int32_t offset=(int32_t)sizeof(indexes);
1160     indexes[Normalizer2Impl::IX_NORM_TRIE_OFFSET]=offset;
1161     offset+=norm16TrieLength;
1162     indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET]=offset;
1163     offset+=extraData.length()*2;
1164     indexes[Normalizer2Impl::IX_SMALL_FCD_OFFSET]=offset;
1165     offset+=sizeof(smallFCD);
1166     int32_t totalSize=offset;
1167     for(int32_t i=Normalizer2Impl::IX_RESERVED3_OFFSET; i<=Normalizer2Impl::IX_TOTAL_SIZE; ++i) {
1168         indexes[i]=totalSize;
1169     }
1170 
1171     if(beVerbose) {
1172         printf("size of normalization trie:         %5ld bytes\n", (long)norm16TrieLength);
1173         printf("size of 16-bit extra data:          %5ld uint16_t\n", (long)extraData.length());
1174         printf("size of small-FCD data:             %5ld bytes\n", (long)sizeof(smallFCD));
1175         printf("size of binary data file contents:  %5ld bytes\n", (long)totalSize);
1176         printf("minDecompNoCodePoint:              U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
1177         printf("minCompNoMaybeCodePoint:           U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
1178         printf("minYesNo:                          0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
1179         printf("minYesNoMappingsOnly:              0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
1180         printf("minNoNo:                           0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
1181         printf("limitNoNo:                         0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
1182         printf("minMaybeYes:                       0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
1183     }
1184 
1185     UVersionInfo nullVersion={ 0, 0, 0, 0 };
1186     if(0==memcmp(nullVersion, unicodeVersion, 4)) {
1187         u_versionFromString(unicodeVersion, U_UNICODE_VERSION);
1188     }
1189     memcpy(dataInfo.dataVersion, unicodeVersion, 4);
1190 }
1191 
writeBinaryFile(const char * filename)1192 void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
1193     processData();
1194 
1195     IcuToolErrorCode errorCode("gennorm2/writeBinaryFile()");
1196     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1197     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1198     errorCode.assertSuccess();
1199 
1200     UNewDataMemory *pData=
1201         udata_create(NULL, NULL, filename, &dataInfo,
1202                      haveCopyright ? U_COPYRIGHT_STRING : NULL, errorCode);
1203     if(errorCode.isFailure()) {
1204         fprintf(stderr, "gennorm2 error: unable to create the output file %s - %s\n",
1205                 filename, errorCode.errorName());
1206         exit(errorCode.reset());
1207     }
1208     udata_writeBlock(pData, indexes, sizeof(indexes));
1209     udata_writeBlock(pData, norm16TrieBytes.getAlias(), norm16TrieLength);
1210     udata_writeUString(pData, extraData.getBuffer(), extraData.length());
1211     udata_writeBlock(pData, smallFCD, sizeof(smallFCD));
1212     int32_t writtenSize=udata_finish(pData, errorCode);
1213     if(errorCode.isFailure()) {
1214         fprintf(stderr, "gennorm2: error %s writing the output file\n", errorCode.errorName());
1215         exit(errorCode.reset());
1216     }
1217     int32_t totalSize=indexes[Normalizer2Impl::IX_TOTAL_SIZE];
1218     if(writtenSize!=totalSize) {
1219         fprintf(stderr, "gennorm2 error: written size %ld != calculated size %ld\n",
1220             (long)writtenSize, (long)totalSize);
1221         exit(U_INTERNAL_PROGRAM_ERROR);
1222     }
1223 }
1224 
1225 void
writeCSourceFile(const char * filename)1226 Normalizer2DataBuilder::writeCSourceFile(const char *filename) {
1227     processData();
1228 
1229     IcuToolErrorCode errorCode("gennorm2/writeCSourceFile()");
1230     const char *basename=findBasename(filename);
1231     CharString path(filename, (int32_t)(basename-filename), errorCode);
1232     CharString dataName(basename, errorCode);
1233     const char *extension=strrchr(basename, '.');
1234     if(extension!=NULL) {
1235         dataName.truncate((int32_t)(extension-basename));
1236     }
1237     errorCode.assertSuccess();
1238 
1239     LocalArray<uint8_t> norm16TrieBytes(new uint8_t[norm16TrieLength]);
1240     utrie2_serialize(norm16Trie, norm16TrieBytes.getAlias(), norm16TrieLength, errorCode);
1241     errorCode.assertSuccess();
1242 
1243     FILE *f=usrc_create(path.data(), basename, "icu/source/tools/gennorm2/n2builder.cpp");
1244     if(f==NULL) {
1245         fprintf(stderr, "gennorm2/writeCSourceFile() error: unable to create the output file %s\n",
1246                 filename);
1247         exit(U_FILE_ACCESS_ERROR);
1248         return;
1249     }
1250     char line[100];
1251     sprintf(line, "static const UVersionInfo %s_formatVersion={", dataName.data());
1252     usrc_writeArray(f, line, dataInfo.formatVersion, 8, 4, "};\n");
1253     sprintf(line, "static const UVersionInfo %s_dataVersion={", dataName.data());
1254     usrc_writeArray(f, line, dataInfo.dataVersion, 8, 4, "};\n\n");
1255     sprintf(line, "static const int32_t %s_indexes[Normalizer2Impl::IX_COUNT]={\n",
1256             dataName.data());
1257     usrc_writeArray(f,
1258         line,
1259         indexes, 32, Normalizer2Impl::IX_COUNT,
1260         "\n};\n\n");
1261     sprintf(line, "static const uint16_t %s_trieIndex[%%ld]={\n", dataName.data());
1262     usrc_writeUTrie2Arrays(f,
1263         line, NULL,
1264         norm16Trie,
1265         "\n};\n\n");
1266     sprintf(line, "static const uint16_t %s_extraData[%%ld]={\n", dataName.data());
1267     usrc_writeArray(f,
1268         line,
1269         extraData.getBuffer(), 16, extraData.length(),
1270         "\n};\n\n");
1271     sprintf(line, "static const uint8_t %s_smallFCD[%%ld]={\n", dataName.data());
1272     usrc_writeArray(f,
1273         line,
1274         smallFCD, 8, sizeof(smallFCD),
1275         "\n};\n\n");
1276     /*fputs(  // TODO
1277         "static const UCaseProps %s_singleton={\n"
1278         "  NULL,\n"
1279         "  %s_indexes,\n"
1280         "  %s_extraData,\n"
1281         "  %s_smallFCD,\n",
1282         f);*/
1283     sprintf(line, "static const UTrie2 %s_trie={\n", dataName.data());
1284     char line2[100];
1285     sprintf(line2, "%s_trieIndex", dataName.data());
1286     usrc_writeUTrie2Struct(f,
1287         line,
1288         norm16Trie, line2, NULL,
1289         "};\n");
1290     fclose(f);
1291 }
1292 
1293 U_NAMESPACE_END
1294 
1295 #endif /* #if !UCONFIG_NO_NORMALIZATION */
1296 
1297 /*
1298  * Hey, Emacs, please set the following:
1299  *
1300  * Local Variables:
1301  * indent-tabs-mode: nil
1302  * End:
1303  */
1304