1 /*
2  *******************************************************************************
3  *   Copyright (C) 2009-2014, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.impl;
9 
10 import java.io.IOException;
11 import java.nio.ByteBuffer;
12 import java.util.ArrayList;
13 import java.util.Iterator;
14 
15 import com.ibm.icu.text.UTF16;
16 import com.ibm.icu.text.UnicodeSet;
17 import com.ibm.icu.util.ICUUncheckedIOException;
18 import com.ibm.icu.util.VersionInfo;
19 
20 public final class Normalizer2Impl {
21     public static final class Hangul {
22         /* Korean Hangul and Jamo constants */
23         public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
24         public static final int JAMO_L_END=0x1112;
25         public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
26         public static final int JAMO_V_END=0x1175;
27         public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
28         public static final int JAMO_T_END=0x11c2;
29 
30         public static final int HANGUL_BASE=0xac00;
31         public static final int HANGUL_END=0xd7a3;
32 
33         public static final int JAMO_L_COUNT=19;
34         public static final int JAMO_V_COUNT=21;
35         public static final int JAMO_T_COUNT=28;
36 
37         public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
38         public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
39 
40         public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
41 
42         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
43         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
44 
isHangul(int c)45         public static boolean isHangul(int c) {
46             return HANGUL_BASE<=c && c<HANGUL_LIMIT;
47         }
isHangulWithoutJamoT(char c)48         public static boolean isHangulWithoutJamoT(char c) {
49             c-=HANGUL_BASE;
50             return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
51         }
isJamoL(int c)52         public static boolean isJamoL(int c) {
53             return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
54         }
isJamoV(int c)55         public static boolean isJamoV(int c) {
56             return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
57         }
58 
59         /**
60          * Decomposes c, which must be a Hangul syllable, into buffer
61          * and returns the length of the decomposition (2 or 3).
62          */
decompose(int c, Appendable buffer)63         public static int decompose(int c, Appendable buffer) {
64             try {
65                 c-=HANGUL_BASE;
66                 int c2=c%JAMO_T_COUNT;
67                 c/=JAMO_T_COUNT;
68                 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
69                 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
70                 if(c2==0) {
71                     return 2;
72                 } else {
73                     buffer.append((char)(JAMO_T_BASE+c2));
74                     return 3;
75                 }
76             } catch(IOException e) {
77                 // Will not occur because we do not write to I/O.
78                 throw new ICUUncheckedIOException(e);
79             }
80         }
81 
82         /**
83          * Decomposes c, which must be a Hangul syllable, into buffer.
84          * This is the raw, not recursive, decomposition. Its length is always 2.
85          */
getRawDecomposition(int c, Appendable buffer)86         public static void getRawDecomposition(int c, Appendable buffer) {
87             try {
88                 int orig=c;
89                 c-=HANGUL_BASE;
90                 int c2=c%JAMO_T_COUNT;
91                 if(c2==0) {
92                     c/=JAMO_T_COUNT;
93                     buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
94                     buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
95                 } else {
96                     buffer.append((char)(orig-c2));  // LV syllable
97                     buffer.append((char)(JAMO_T_BASE+c2));
98                 }
99             } catch(IOException e) {
100                 // Will not occur because we do not write to I/O.
101                 throw new ICUUncheckedIOException(e);
102             }
103         }
104     }
105 
106     /**
107      * Writable buffer that takes care of canonical ordering.
108      * Its Appendable methods behave like the C++ implementation's
109      * appendZeroCC() methods.
110      * <p>
111      * If dest is a StringBuilder, then the buffer writes directly to it.
112      * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
113      * until no further changes are necessary and whole segments are appended.
114      * append() methods that take combining-class values always write to the StringBuilder.
115      * Other append() methods flush and append to the Appendable.
116      */
117     public static final class ReorderingBuffer implements Appendable {
ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity)118         public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
119             impl=ni;
120             app=dest;
121             if(app instanceof StringBuilder) {
122                 appIsStringBuilder=true;
123                 str=(StringBuilder)dest;
124                 // In Java, the constructor subsumes public void init(int destCapacity) {
125                 str.ensureCapacity(destCapacity);
126                 reorderStart=0;
127                 if(str.length()==0) {
128                     lastCC=0;
129                 } else {
130                     setIterator();
131                     lastCC=previousCC();
132                     // Set reorderStart after the last code point with cc<=1 if there is one.
133                     if(lastCC>1) {
134                         while(previousCC()>1) {}
135                     }
136                     reorderStart=codePointLimit;
137                 }
138             } else {
139                 appIsStringBuilder=false;
140                 str=new StringBuilder();
141                 reorderStart=0;
142                 lastCC=0;
143             }
144         }
145 
isEmpty()146         public boolean isEmpty() { return str.length()==0; }
length()147         public int length() { return str.length(); }
getLastCC()148         public int getLastCC() { return lastCC; }
149 
getStringBuilder()150         public StringBuilder getStringBuilder() { return str; }
151 
equals(CharSequence s, int start, int limit)152         public boolean equals(CharSequence s, int start, int limit) {
153             return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
154         }
155 
156         // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
setLastChar(char c)157         public void setLastChar(char c) {
158             str.setCharAt(str.length()-1, c);
159         }
160 
append(int c, int cc)161         public void append(int c, int cc) {
162             if(lastCC<=cc || cc==0) {
163                 str.appendCodePoint(c);
164                 lastCC=cc;
165                 if(cc<=1) {
166                     reorderStart=str.length();
167                 }
168             } else {
169                 insert(c, cc);
170             }
171         }
172         // s must be in NFD, otherwise change the implementation.
append(CharSequence s, int start, int limit, int leadCC, int trailCC)173         public void append(CharSequence s, int start, int limit,
174                            int leadCC, int trailCC) {
175             if(start==limit) {
176                 return;
177             }
178             if(lastCC<=leadCC || leadCC==0) {
179                 if(trailCC<=1) {
180                     reorderStart=str.length()+(limit-start);
181                 } else if(leadCC<=1) {
182                     reorderStart=str.length()+1;  // Ok if not a code point boundary.
183                 }
184                 str.append(s, start, limit);
185                 lastCC=trailCC;
186             } else {
187                 int c=Character.codePointAt(s, start);
188                 start+=Character.charCount(c);
189                 insert(c, leadCC);  // insert first code point
190                 while(start<limit) {
191                     c=Character.codePointAt(s, start);
192                     start+=Character.charCount(c);
193                     if(start<limit) {
194                         // s must be in NFD, otherwise we need to use getCC().
195                         leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
196                     } else {
197                         leadCC=trailCC;
198                     }
199                     append(c, leadCC);
200                 }
201             }
202         }
203         // The following append() methods work like C++ appendZeroCC().
204         // They assume that the cc or trailCC of their input is 0.
205         // Most of them implement Appendable interface methods.
206         // @Override when we switch to Java 6
append(char c)207         public ReorderingBuffer append(char c) {
208             str.append(c);
209             lastCC=0;
210             reorderStart=str.length();
211             return this;
212         }
appendZeroCC(int c)213         public void appendZeroCC(int c) {
214             str.appendCodePoint(c);
215             lastCC=0;
216             reorderStart=str.length();
217         }
218         // @Override when we switch to Java 6
append(CharSequence s)219         public ReorderingBuffer append(CharSequence s) {
220             if(s.length()!=0) {
221                 str.append(s);
222                 lastCC=0;
223                 reorderStart=str.length();
224             }
225             return this;
226         }
227         // @Override when we switch to Java 6
append(CharSequence s, int start, int limit)228         public ReorderingBuffer append(CharSequence s, int start, int limit) {
229             if(start!=limit) {
230                 str.append(s, start, limit);
231                 lastCC=0;
232                 reorderStart=str.length();
233             }
234             return this;
235         }
236         /**
237          * Flushes from the intermediate StringBuilder to the Appendable,
238          * if they are different objects.
239          * Used after recomposition.
240          * Must be called at the end when writing to a non-StringBuilder Appendable.
241          */
flush()242         public void flush() {
243             if(appIsStringBuilder) {
244                 reorderStart=str.length();
245             } else {
246                 try {
247                     app.append(str);
248                     str.setLength(0);
249                     reorderStart=0;
250                 } catch(IOException e) {
251                     throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
252                 }
253             }
254             lastCC=0;
255         }
256         /**
257          * Flushes from the intermediate StringBuilder to the Appendable,
258          * if they are different objects.
259          * Then appends the new text to the Appendable or StringBuilder.
260          * Normally used after quick check loops find a non-empty sequence.
261          */
flushAndAppendZeroCC(CharSequence s, int start, int limit)262         public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
263             if(appIsStringBuilder) {
264                 str.append(s, start, limit);
265                 reorderStart=str.length();
266             } else {
267                 try {
268                     app.append(str).append(s, start, limit);
269                     str.setLength(0);
270                     reorderStart=0;
271                 } catch(IOException e) {
272                     throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
273                 }
274             }
275             lastCC=0;
276             return this;
277         }
remove()278         public void remove() {
279             str.setLength(0);
280             lastCC=0;
281             reorderStart=0;
282         }
removeSuffix(int suffixLength)283         public void removeSuffix(int suffixLength) {
284             int oldLength=str.length();
285             str.delete(oldLength-suffixLength, oldLength);
286             lastCC=0;
287             reorderStart=str.length();
288         }
289 
290         /*
291          * TODO: Revisit whether it makes sense to track reorderStart.
292          * It is set to after the last known character with cc<=1,
293          * which stops previousCC() before it reads that character and looks up its cc.
294          * previousCC() is normally only called from insert().
295          * In other words, reorderStart speeds up the insertion of a combining mark
296          * into a multi-combining mark sequence where it does not belong at the end.
297          * This might not be worth the trouble.
298          * On the other hand, it's not a huge amount of trouble.
299          *
300          * We probably need it for UNORM_SIMPLE_APPEND.
301          */
302 
303         // Inserts c somewhere before the last character.
304         // Requires 0<cc<lastCC which implies reorderStart<limit.
insert(int c, int cc)305         private void insert(int c, int cc) {
306             for(setIterator(), skipPrevious(); previousCC()>cc;) {}
307             // insert c at codePointLimit, after the character with prevCC<=cc
308             if(c<=0xffff) {
309                 str.insert(codePointLimit, (char)c);
310                 if(cc<=1) {
311                     reorderStart=codePointLimit+1;
312                 }
313             } else {
314                 str.insert(codePointLimit, Character.toChars(c));
315                 if(cc<=1) {
316                     reorderStart=codePointLimit+2;
317                 }
318             }
319         }
320 
321         private final Normalizer2Impl impl;
322         private final Appendable app;
323         private final StringBuilder str;
324         private final boolean appIsStringBuilder;
325         private int reorderStart;
326         private int lastCC;
327 
328         // private backward iterator
setIterator()329         private void setIterator() { codePointStart=str.length(); }
skipPrevious()330         private void skipPrevious() {  // Requires 0<codePointStart.
331             codePointLimit=codePointStart;
332             codePointStart=str.offsetByCodePoints(codePointStart, -1);
333         }
previousCC()334         private int previousCC() {  // Returns 0 if there is no previous character.
335             codePointLimit=codePointStart;
336             if(reorderStart>=codePointStart) {
337                 return 0;
338             }
339             int c=str.codePointBefore(codePointStart);
340             codePointStart-=Character.charCount(c);
341             if(c<MIN_CCC_LCCC_CP) {
342                 return 0;
343             }
344             return getCCFromYesOrMaybe(impl.getNorm16(c));
345         }
346 
347         private int codePointStart, codePointLimit;
348     }
349 
350     // TODO: Propose as public API on the UTF16 class.
351     // TODO: Propose widening UTF16 methods that take char to take int.
352     // TODO: Propose widening UTF16 methods that take String to take CharSequence.
353     public static final class UTF16Plus {
354         /**
355          * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
356          * is it a lead surrogate?
357          * @param c code unit or code point
358          * @return true or false
359          */
isSurrogateLead(int c)360         public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
361         /**
362          * Compares two CharSequence objects for binary equality.
363          * @param s1 first sequence
364          * @param s2 second sequence
365          * @return true if s1 contains the same text as s2
366          */
equal(CharSequence s1, CharSequence s2)367         public static boolean equal(CharSequence s1,  CharSequence s2) {
368             if(s1==s2) {
369                 return true;
370             }
371             int length=s1.length();
372             if(length!=s2.length()) {
373                 return false;
374             }
375             for(int i=0; i<length; ++i) {
376                 if(s1.charAt(i)!=s2.charAt(i)) {
377                     return false;
378                 }
379             }
380             return true;
381         }
382         /**
383          * Compares two CharSequence subsequences for binary equality.
384          * @param s1 first sequence
385          * @param start1 start offset in first sequence
386          * @param limit1 limit offset in first sequence
387          * @param s2 second sequence
388          * @param start2 start offset in second sequence
389          * @param limit2 limit offset in second sequence
390          * @return true if s1.subSequence(start1, limit1) contains the same text
391          *              as s2.subSequence(start2, limit2)
392          */
equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2)393         public static boolean equal(CharSequence s1, int start1, int limit1,
394                                     CharSequence s2, int start2, int limit2) {
395             if((limit1-start1)!=(limit2-start2)) {
396                 return false;
397             }
398             if(s1==s2 && start1==start2) {
399                 return true;
400             }
401             while(start1<limit1) {
402                 if(s1.charAt(start1++)!=s2.charAt(start2++)) {
403                     return false;
404                 }
405             }
406             return true;
407         }
408     }
409 
Normalizer2Impl()410     public Normalizer2Impl() {}
411 
412     private static final class IsAcceptable implements ICUBinary.Authenticate {
413         // @Override when we switch to Java 6
isDataVersionAcceptable(byte version[])414         public boolean isDataVersionAcceptable(byte version[]) {
415             return version[0]==2;
416         }
417     }
418     private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
419     private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"
420 
load(ByteBuffer bytes)421     public Normalizer2Impl load(ByteBuffer bytes) {
422         try {
423             dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
424             int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
425             if(indexesLength<=IX_MIN_MAYBE_YES) {
426                 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
427             }
428             int[] inIndexes=new int[indexesLength];
429             inIndexes[0]=indexesLength*4;
430             for(int i=1; i<indexesLength; ++i) {
431                 inIndexes[i]=bytes.getInt();
432             }
433 
434             minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
435             minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
436 
437             minYesNo=inIndexes[IX_MIN_YES_NO];
438             minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
439             minNoNo=inIndexes[IX_MIN_NO_NO];
440             limitNoNo=inIndexes[IX_LIMIT_NO_NO];
441             minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
442 
443             // Read the normTrie.
444             int offset=inIndexes[IX_NORM_TRIE_OFFSET];
445             int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
446             normTrie=Trie2_16.createFromSerialized(bytes);
447             int trieLength=normTrie.getSerializedLength();
448             if(trieLength>(nextOffset-offset)) {
449                 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
450             }
451             ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes
452 
453             // Read the composition and mapping data.
454             offset=nextOffset;
455             nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
456             int numChars=(nextOffset-offset)/2;
457             char[] chars;
458             if(numChars!=0) {
459                 chars=new char[numChars];
460                 for(int i=0; i<numChars; ++i) {
461                     chars[i]=bytes.getChar();
462                 }
463                 maybeYesCompositions=new String(chars);
464                 extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
465             }
466 
467             // smallFCD: new in formatVersion 2
468             offset=nextOffset;
469             smallFCD=new byte[0x100];
470             for(int i=0; i<0x100; ++i) {
471                 smallFCD[i]=bytes.get();
472             }
473 
474             // Build tccc180[].
475             // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
476             tccc180=new int[0x180];
477             int bits=0;
478             for(int c=0; c<0x180; bits>>=1) {
479                 if((c&0xff)==0) {
480                     bits=smallFCD[c>>8];  // one byte per 0x100 code points
481                 }
482                 if((bits&1)!=0) {
483                     for(int i=0; i<0x20; ++i, ++c) {
484                         tccc180[c]=getFCD16FromNormData(c)&0xff;
485                     }
486                 } else {
487                     c+=0x20;
488                 }
489             }
490 
491             return this;
492         } catch(IOException e) {
493             throw new ICUUncheckedIOException(e);
494         }
495     }
load(String name)496     public Normalizer2Impl load(String name) {
497         return load(ICUBinary.getRequiredData(name));
498     }
499 
enumLcccRange(int start, int end, int norm16, UnicodeSet set)500     private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
501         if(isAlgorithmicNoNo(norm16)) {
502             // Range of code points with same-norm16-value algorithmic decompositions.
503             // They might have different non-zero FCD16 values.
504             do {
505                 int fcd16=getFCD16(start);
506                 if(fcd16>0xff) { set.add(start); }
507             } while(++start<=end);
508         } else {
509             int fcd16=getFCD16(start);
510             if(fcd16>0xff) { set.add(start, end); }
511         }
512     }
513 
enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set)514     private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
515         /* add the start code point to the USet */
516         set.add(start);
517         if(start!=end && isAlgorithmicNoNo(value)) {
518             // Range of code points with same-norm16-value algorithmic decompositions.
519             // They might have different non-zero FCD16 values.
520             int prevFCD16=getFCD16(start);
521             while(++start<=end) {
522                 int fcd16=getFCD16(start);
523                 if(fcd16!=prevFCD16) {
524                     set.add(start);
525                     prevFCD16=fcd16;
526                 }
527             }
528         }
529     }
530 
addLcccChars(UnicodeSet set)531     public void addLcccChars(UnicodeSet set) {
532         /* add the start code point of each same-value range of each trie */
533         Iterator<Trie2.Range> trieIterator=normTrie.iterator();
534         Trie2.Range range;
535         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
536             enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
537         }
538     }
539 
addPropertyStarts(UnicodeSet set)540     public void addPropertyStarts(UnicodeSet set) {
541         /* add the start code point of each same-value range of each trie */
542         Iterator<Trie2.Range> trieIterator=normTrie.iterator();
543         Trie2.Range range;
544         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
545             enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
546         }
547 
548         /* add Hangul LV syllables and LV+1 because of skippables */
549         for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
550             set.add(c);
551             set.add(c+1);
552         }
553         set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */
554     }
555 
addCanonIterPropertyStarts(UnicodeSet set)556     public void addCanonIterPropertyStarts(UnicodeSet set) {
557         /* add the start code point of each same-value range of the canonical iterator data trie */
558         ensureCanonIterData();
559         // currently only used for the SEGMENT_STARTER property
560         Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
561         Trie2.Range range;
562         while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
563             /* add the start code point to the USet */
564             set.add(range.startCodePoint);
565         }
566     }
567     private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
568         public int map(int in) {
569             return in&CANON_NOT_SEGMENT_STARTER;
570         }
571     };
572 
573     // low-level properties ------------------------------------------------ ***
574 
getNormTrie()575     public Trie2_16 getNormTrie() { return normTrie; }
576 
577     // Note: Normalizer2Impl.java r30983 (2011-nov-27)
578     // still had getFCDTrie() which built and cached an FCD trie.
579     // That provided faster access to FCD data than getFCD16FromNormData()
580     // but required synchronization and consumed some 10kB of heap memory
581     // in any process that uses FCD (e.g., via collation).
582     // tccc180[] and smallFCD[] are intended to help with any loss of performance,
583     // at least for Latin & CJK.
584 
585     /**
586      * Builds the canonical-iterator data for this instance.
587      * This is required before any of {@link #isCanonSegmentStarter(int)} or
588      * {@link #getCanonStartSet(int, UnicodeSet)} are called,
589      * or else they crash.
590      * @return this
591      */
ensureCanonIterData()592     public synchronized Normalizer2Impl ensureCanonIterData() {
593         if(canonIterData==null) {
594             Trie2Writable newData=new Trie2Writable(0, 0);
595             canonStartSets=new ArrayList<UnicodeSet>();
596             Iterator<Trie2.Range> trieIterator=normTrie.iterator();
597             Trie2.Range range;
598             while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
599                 final int norm16=range.value;
600                 if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
601                     // Inert, or 2-way mapping (including Hangul syllable).
602                     // We do not write a canonStartSet for any yesNo character.
603                     // Composites from 2-way mappings are added at runtime from the
604                     // starter's compositions list, and the other characters in
605                     // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
606                     // "maybe" characters.
607                     continue;
608                 }
609                 for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
610                     final int oldValue=newData.get(c);
611                     int newValue=oldValue;
612                     if(norm16>=minMaybeYes) {
613                         // not a segment starter if it occurs in a decomposition or has cc!=0
614                         newValue|=CANON_NOT_SEGMENT_STARTER;
615                         if(norm16<MIN_NORMAL_MAYBE_YES) {
616                             newValue|=CANON_HAS_COMPOSITIONS;
617                         }
618                     } else if(norm16<minYesNo) {
619                         newValue|=CANON_HAS_COMPOSITIONS;
620                     } else {
621                         // c has a one-way decomposition
622                         int c2=c;
623                         int norm16_2=norm16;
624                         while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
625                             c2=this.mapAlgorithmic(c2, norm16_2);
626                             norm16_2=getNorm16(c2);
627                         }
628                         if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
629                             // c decomposes, get everything from the variable-length extra data
630                             int firstUnit=extraData.charAt(norm16_2);
631                             int length=firstUnit&MAPPING_LENGTH_MASK;
632                             if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
633                                 if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) {
634                                     newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
635                                 }
636                             }
637                             // Skip empty mappings (no characters in the decomposition).
638                             if(length!=0) {
639                                 ++norm16_2;  // skip over the firstUnit
640                                 // add c to first code point's start set
641                                 int limit=norm16_2+length;
642                                 c2=extraData.codePointAt(norm16_2);
643                                 addToStartSet(newData, c, c2);
644                                 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
645                                 // one-way mapping. A 2-way mapping is possible here after
646                                 // intermediate algorithmic mapping.
647                                 if(norm16_2>=minNoNo) {
648                                     while((norm16_2+=Character.charCount(c2))<limit) {
649                                         c2=extraData.codePointAt(norm16_2);
650                                         int c2Value=newData.get(c2);
651                                         if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
652                                             newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
653                                         }
654                                     }
655                                 }
656                             }
657                         } else {
658                             // c decomposed to c2 algorithmically; c has cc==0
659                             addToStartSet(newData, c, c2);
660                         }
661                     }
662                     if(newValue!=oldValue) {
663                         newData.set(c, newValue);
664                     }
665                 }
666             }
667             canonIterData=newData.toTrie2_32();
668         }
669         return this;
670     }
671 
getNorm16(int c)672     public int getNorm16(int c) { return normTrie.get(c); }
673 
getCompQuickCheck(int norm16)674     public int getCompQuickCheck(int norm16) {
675         if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
676             return 1;  // yes
677         } else if(minMaybeYes<=norm16) {
678             return 2;  // maybe
679         } else {
680             return 0;  // no
681         }
682     }
isAlgorithmicNoNo(int norm16)683     public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
isCompNo(int norm16)684     public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
isDecompYes(int norm16)685     public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
686 
getCC(int norm16)687     public int getCC(int norm16) {
688         if(norm16>=MIN_NORMAL_MAYBE_YES) {
689             return norm16&0xff;
690         }
691         if(norm16<minNoNo || limitNoNo<=norm16) {
692             return 0;
693         }
694         return getCCFromNoNo(norm16);
695     }
getCCFromYesOrMaybe(int norm16)696     public static int getCCFromYesOrMaybe(int norm16) {
697         return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
698     }
699 
700     /**
701      * Returns the FCD data for code point c.
702      * @param c A Unicode code point.
703      * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
704      */
getFCD16(int c)705     public int getFCD16(int c) {
706         if(c<0) {
707             return 0;
708         } else if(c<0x180) {
709             return tccc180[c];
710         } else if(c<=0xffff) {
711             if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
712         }
713         return getFCD16FromNormData(c);
714     }
715     /** Returns the FCD data for U+0000<=c<U+0180. */
getFCD16FromBelow180(int c)716     public int getFCD16FromBelow180(int c) { return tccc180[c]; }
717     /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
singleLeadMightHaveNonZeroFCD16(int lead)718     public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
719         // 0<=lead<=0xffff
720         byte bits=smallFCD[lead>>8];
721         if(bits==0) { return false; }
722         return ((bits>>((lead>>5)&7))&1)!=0;
723     }
724 
725     /** Gets the FCD value from the regular normalization data. */
getFCD16FromNormData(int c)726     public int getFCD16FromNormData(int c) {
727         // Only loops for 1:1 algorithmic mappings.
728         for(;;) {
729             int norm16=getNorm16(c);
730             if(norm16<=minYesNo) {
731                 // no decomposition or Hangul syllable, all zeros
732                 return 0;
733             } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
734                 // combining mark
735                 norm16&=0xff;
736                 return norm16|(norm16<<8);
737             } else if(norm16>=minMaybeYes) {
738                 return 0;
739             } else if(isDecompNoAlgorithmic(norm16)) {
740                 c=mapAlgorithmic(c, norm16);
741             } else {
742                 // c decomposes, get everything from the variable-length extra data
743                 int firstUnit=extraData.charAt(norm16);
744                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
745                     // A character that is deleted (maps to an empty string) must
746                     // get the worst-case lccc and tccc values because arbitrary
747                     // characters on both sides will become adjacent.
748                     return 0x1ff;
749                 } else {
750                     int fcd16=firstUnit>>8;  // tccc
751                     if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
752                         fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
753                     }
754                     return fcd16;
755                 }
756             }
757         }
758     }
759 
760     /**
761      * Gets the decomposition for one code point.
762      * @param c code point
763      * @return c's decomposition, if it has one; returns null if it does not have a decomposition
764      */
getDecomposition(int c)765     public String getDecomposition(int c) {
766         int decomp=-1;
767         int norm16;
768         for(;;) {
769             if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
770                 // c does not decompose
771             } else if(isHangul(norm16)) {
772                 // Hangul syllable: decompose algorithmically
773                 StringBuilder buffer=new StringBuilder();
774                 Hangul.decompose(c, buffer);
775                 return buffer.toString();
776             } else if(isDecompNoAlgorithmic(norm16)) {
777                 decomp=c=mapAlgorithmic(c, norm16);
778                 continue;
779             } else {
780                 // c decomposes, get everything from the variable-length extra data
781                 int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
782                 return extraData.substring(norm16, norm16+length);
783             }
784             if(decomp<0) {
785                 return null;
786             } else {
787                 return UTF16.valueOf(decomp);
788             }
789         }
790     }
791 
792     /**
793      * Gets the raw decomposition for one code point.
794      * @param c code point
795      * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
796      */
getRawDecomposition(int c)797     public String getRawDecomposition(int c) {
798         // We do not loop in this method because an algorithmic mapping itself
799         // becomes a final result rather than having to be decomposed recursively.
800         int norm16;
801         if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
802             // c does not decompose
803             return null;
804         } else if(isHangul(norm16)) {
805             // Hangul syllable: decompose algorithmically
806             StringBuilder buffer=new StringBuilder();
807             Hangul.getRawDecomposition(c, buffer);
808             return buffer.toString();
809         } else if(isDecompNoAlgorithmic(norm16)) {
810             return UTF16.valueOf(mapAlgorithmic(c, norm16));
811         } else {
812             // c decomposes, get everything from the variable-length extra data
813             int firstUnit=extraData.charAt(norm16);
814             int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
815             if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
816                 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
817                 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
818                 int rawMapping=norm16-((firstUnit>>7)&1)-1;
819                 char rm0=extraData.charAt(rawMapping);
820                 if(rm0<=MAPPING_LENGTH_MASK) {
821                     return extraData.substring(rawMapping-rm0, rawMapping);
822                 } else {
823                     // Copy the normal mapping and replace its first two code units with rm0.
824                     StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
825                     norm16+=1+2;  // skip over the firstUnit and the first two mapping code units
826                     return buffer.append(extraData, norm16, norm16+mLength-2).toString();
827                 }
828             } else {
829                 norm16+=1;  // skip over the firstUnit
830                 return extraData.substring(norm16, norm16+mLength);
831             }
832         }
833     }
834 
835     /**
836      * Returns true if code point c starts a canonical-iterator string segment.
837      * <b>{@link #ensureCanonIterData()} must have been called before this method,
838      * or else this method will crash.</b>
839      * @param c A Unicode code point.
840      * @return true if c starts a canonical-iterator string segment.
841      */
isCanonSegmentStarter(int c)842     public boolean isCanonSegmentStarter(int c) {
843         return canonIterData.get(c)>=0;
844     }
845     /**
846      * Returns true if there are characters whose decomposition starts with c.
847      * If so, then the set is cleared and then filled with those characters.
848      * <b>{@link #ensureCanonIterData()} must have been called before this method,
849      * or else this method will crash.</b>
850      * @param c A Unicode code point.
851      * @param set A UnicodeSet to receive the characters whose decompositions
852      *        start with c, if there are any.
853      * @return true if there are characters whose decomposition starts with c.
854      */
getCanonStartSet(int c, UnicodeSet set)855     public boolean getCanonStartSet(int c, UnicodeSet set) {
856         int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
857         if(canonValue==0) {
858             return false;
859         }
860         set.clear();
861         int value=canonValue&CANON_VALUE_MASK;
862         if((canonValue&CANON_HAS_SET)!=0) {
863             set.addAll(canonStartSets.get(value));
864         } else if(value!=0) {
865             set.add(value);
866         }
867         if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
868             int norm16=getNorm16(c);
869             if(norm16==JAMO_L) {
870                 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
871                 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
872             } else {
873                 addComposites(getCompositionsList(norm16), set);
874             }
875         }
876         return true;
877     }
878 
879     public static final int MIN_CCC_LCCC_CP=0x300;
880 
881     public static final int MIN_YES_YES_WITH_CC=0xff01;
882     public static final int JAMO_VT=0xff00;
883     public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
884     public static final int JAMO_L=1;
885     public static final int MAX_DELTA=0x40;
886 
887     // Byte offsets from the start of the data, after the generic header.
888     public static final int IX_NORM_TRIE_OFFSET=0;
889     public static final int IX_EXTRA_DATA_OFFSET=1;
890     public static final int IX_SMALL_FCD_OFFSET=2;
891     public static final int IX_RESERVED3_OFFSET=3;
892     public static final int IX_TOTAL_SIZE=7;
893 
894     // Code point thresholds for quick check codes.
895     public static final int IX_MIN_DECOMP_NO_CP=8;
896     public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
897 
898     // Norm16 value thresholds for quick check combinations and types of extra data.
899     // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
900     public static final int IX_MIN_YES_NO=10;
901     public static final int IX_MIN_NO_NO=11;
902     public static final int IX_LIMIT_NO_NO=12;
903     public static final int IX_MIN_MAYBE_YES=13;
904 
905     // Mappings only in [minYesNoMappingsOnly..minNoNo[.
906     public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
907 
908     public static final int IX_COUNT=16;
909 
910     public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
911     public static final int MAPPING_HAS_RAW_MAPPING=0x40;
912     public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
913     public static final int MAPPING_LENGTH_MASK=0x1f;
914 
915     public static final int COMP_1_LAST_TUPLE=0x8000;
916     public static final int COMP_1_TRIPLE=1;
917     public static final int COMP_1_TRAIL_LIMIT=0x3400;
918     public static final int COMP_1_TRAIL_MASK=0x7ffe;
919     public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
920     public static final int COMP_2_TRAIL_SHIFT=6;
921     public static final int COMP_2_TRAIL_MASK=0xffc0;
922 
923     // higher-level functionality ------------------------------------------ ***
924 
925     // NFD without an NFD Normalizer2 instance.
decompose(CharSequence s, StringBuilder dest)926     public Appendable decompose(CharSequence s, StringBuilder dest) {
927         decompose(s, 0, s.length(), dest, s.length());
928         return dest;
929     }
930     /**
931      * Decomposes s[src, limit[ and writes the result to dest.
932      * limit can be NULL if src is NUL-terminated.
933      * destLengthEstimate is the initial dest buffer capacity and can be -1.
934      */
decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate)935     public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
936                    int destLengthEstimate) {
937         if(destLengthEstimate<0) {
938             destLengthEstimate=limit-src;
939         }
940         dest.setLength(0);
941         ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
942         decompose(s, src, limit, buffer);
943     }
944 
945     // Dual functionality:
946     // buffer!=NULL: normalize
947     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer)948     public int decompose(CharSequence s, int src, int limit,
949                          ReorderingBuffer buffer) {
950         int minNoCP=minDecompNoCP;
951 
952         int prevSrc;
953         int c=0;
954         int norm16=0;
955 
956         // only for quick check
957         int prevBoundary=src;
958         int prevCC=0;
959 
960         for(;;) {
961             // count code units below the minimum or with irrelevant data for the quick check
962             for(prevSrc=src; src!=limit;) {
963                 if( (c=s.charAt(src))<minNoCP ||
964                     isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
965                 ) {
966                     ++src;
967                 } else if(!UTF16.isSurrogate((char)c)) {
968                     break;
969                 } else {
970                     char c2;
971                     if(UTF16Plus.isSurrogateLead(c)) {
972                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
973                             c=Character.toCodePoint((char)c, c2);
974                         }
975                     } else /* trail surrogate */ {
976                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
977                             --src;
978                             c=Character.toCodePoint(c2, (char)c);
979                         }
980                     }
981                     if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
982                         src+=Character.charCount(c);
983                     } else {
984                         break;
985                     }
986                 }
987             }
988             // copy these code units all at once
989             if(src!=prevSrc) {
990                 if(buffer!=null) {
991                     buffer.flushAndAppendZeroCC(s, prevSrc, src);
992                 } else {
993                     prevCC=0;
994                     prevBoundary=src;
995                 }
996             }
997             if(src==limit) {
998                 break;
999             }
1000 
1001             // Check one above-minimum, relevant code point.
1002             src+=Character.charCount(c);
1003             if(buffer!=null) {
1004                 decompose(c, norm16, buffer);
1005             } else {
1006                 if(isDecompYes(norm16)) {
1007                     int cc=getCCFromYesOrMaybe(norm16);
1008                     if(prevCC<=cc || cc==0) {
1009                         prevCC=cc;
1010                         if(cc<=1) {
1011                             prevBoundary=src;
1012                         }
1013                         continue;
1014                     }
1015                 }
1016                 return prevBoundary;  // "no" or cc out of order
1017             }
1018         }
1019         return src;
1020     }
decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer)1021     public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
1022         int limit=s.length();
1023         if(limit==0) {
1024             return;
1025         }
1026         if(doDecompose) {
1027             decompose(s, 0, limit, buffer);
1028             return;
1029         }
1030         // Just merge the strings at the boundary.
1031         int c=Character.codePointAt(s, 0);
1032         int src=0;
1033         int firstCC, prevCC, cc;
1034         firstCC=prevCC=cc=getCC(getNorm16(c));
1035         while(cc!=0) {
1036             prevCC=cc;
1037             src+=Character.charCount(c);
1038             if(src>=limit) {
1039                 break;
1040             }
1041             c=Character.codePointAt(s, src);
1042             cc=getCC(getNorm16(c));
1043         };
1044         buffer.append(s, 0, src, firstCC, prevCC);
1045         buffer.append(s, src, limit);
1046     }
1047     // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
1048     // doCompose: normalize
1049     // !doCompose: isNormalized (buffer must be empty and initialized)
compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer)1050     public boolean compose(CharSequence s, int src, int limit,
1051                            boolean onlyContiguous,
1052                            boolean doCompose,
1053                            ReorderingBuffer buffer) {
1054         int minNoMaybeCP=minCompNoMaybeCP;
1055 
1056         /*
1057          * prevBoundary points to the last character before the current one
1058          * that has a composition boundary before it with ccc==0 and quick check "yes".
1059          * Keeping track of prevBoundary saves us looking for a composition boundary
1060          * when we find a "no" or "maybe".
1061          *
1062          * When we back out from prevSrc back to prevBoundary,
1063          * then we also remove those same characters (which had been simply copied
1064          * or canonically-order-inserted) from the ReorderingBuffer.
1065          * Therefore, at all times, the [prevBoundary..prevSrc[ source units
1066          * must correspond 1:1 to destination units at the end of the destination buffer.
1067          */
1068         int prevBoundary=src;
1069         int prevSrc;
1070         int c=0;
1071         int norm16=0;
1072 
1073         // only for isNormalized
1074         int prevCC=0;
1075 
1076         for(;;) {
1077             // count code units below the minimum or with irrelevant data for the quick check
1078             for(prevSrc=src; src!=limit;) {
1079                 if( (c=s.charAt(src))<minNoMaybeCP ||
1080                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1081                 ) {
1082                     ++src;
1083                 } else if(!UTF16.isSurrogate((char)c)) {
1084                     break;
1085                 } else {
1086                     char c2;
1087                     if(UTF16Plus.isSurrogateLead(c)) {
1088                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1089                             c=Character.toCodePoint((char)c, c2);
1090                         }
1091                     } else /* trail surrogate */ {
1092                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1093                             --src;
1094                             c=Character.toCodePoint(c2, (char)c);
1095                         }
1096                     }
1097                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1098                         src+=Character.charCount(c);
1099                     } else {
1100                         break;
1101                     }
1102                 }
1103             }
1104             // copy these code units all at once
1105             if(src!=prevSrc) {
1106                 if(src==limit) {
1107                     if(doCompose) {
1108                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1109                     }
1110                     break;
1111                 }
1112                 // Set prevBoundary to the last character in the quick check loop.
1113                 prevBoundary=src-1;
1114                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1115                     Character.isHighSurrogate(s.charAt(prevBoundary-1))
1116                 ) {
1117                     --prevBoundary;
1118                 }
1119                 if(doCompose) {
1120                     // The last "quick check yes" character is excluded from the
1121                     // flush-and-append call in case it needs to be modified.
1122                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1123                     buffer.append(s, prevBoundary, src);
1124                 } else {
1125                     prevCC=0;
1126                 }
1127                 // The start of the current character (c).
1128                 prevSrc=src;
1129             } else if(src==limit) {
1130                 break;
1131             }
1132 
1133             src+=Character.charCount(c);
1134             /*
1135              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1136              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1137              * or has ccc!=0.
1138              * Check for Jamo V/T, then for regular characters.
1139              * c is not a Hangul syllable or Jamo L because those have "yes" properties.
1140              */
1141             if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
1142                 char prev=s.charAt(prevSrc-1);
1143                 boolean needToDecompose=false;
1144                 if(c<Hangul.JAMO_T_BASE) {
1145                     // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1146                     prev-=Hangul.JAMO_L_BASE;
1147                     if(prev<Hangul.JAMO_L_COUNT) {
1148                         if(!doCompose) {
1149                             return false;
1150                         }
1151                         char syllable=(char)
1152                             (Hangul.HANGUL_BASE+
1153                              (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1154                              Hangul.JAMO_T_COUNT);
1155                         char t;
1156                         if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1157                             ++src;
1158                             syllable+=t;  // The next character was a Jamo T.
1159                             prevBoundary=src;
1160                             buffer.setLastChar(syllable);
1161                             continue;
1162                         }
1163                         // If we see L+V+x where x!=T then we drop to the slow path,
1164                         // decompose and recompose.
1165                         // This is to deal with NFKC finding normal L and V but a
1166                         // compatibility variant of a T. We need to either fully compose that
1167                         // combination here (which would complicate the code and may not work
1168                         // with strange custom data) or use the slow path -- or else our replacing
1169                         // two input characters (L+V) with one output character (LV syllable)
1170                         // would violate the invariant that [prevBoundary..prevSrc[ has the same
1171                         // length as what we appended to the buffer since prevBoundary.
1172                         needToDecompose=true;
1173                     }
1174                 } else if(Hangul.isHangulWithoutJamoT(prev)) {
1175                     // c is a Jamo Trailing consonant,
1176                     // compose with previous Hangul LV that does not contain a Jamo T.
1177                     if(!doCompose) {
1178                         return false;
1179                     }
1180                     buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
1181                     prevBoundary=src;
1182                     continue;
1183                 }
1184                 if(!needToDecompose) {
1185                     // The Jamo V/T did not compose into a Hangul syllable.
1186                     if(doCompose) {
1187                         buffer.append((char)c);
1188                     } else {
1189                         prevCC=0;
1190                     }
1191                     continue;
1192                 }
1193             }
1194             /*
1195              * Source buffer pointers:
1196              *
1197              *  all done      quick check   current char  not yet
1198              *                "yes" but     (c)           processed
1199              *                may combine
1200              *                forward
1201              * [-------------[-------------[-------------[-------------[
1202              * |             |             |             |             |
1203              * orig. src     prevBoundary  prevSrc       src           limit
1204              *
1205              *
1206              * Destination buffer pointers inside the ReorderingBuffer:
1207              *
1208              *  all done      might take    not filled yet
1209              *                characters for
1210              *                reordering
1211              * [-------------[-------------[-------------[
1212              * |             |             |             |
1213              * start         reorderStart  limit         |
1214              *                             +remainingCap.+
1215              */
1216             if(norm16>=MIN_YES_YES_WITH_CC) {
1217                 int cc=norm16&0xff;  // cc!=0
1218                 if( onlyContiguous &&  // FCC
1219                     (doCompose ? buffer.getLastCC() : prevCC)==0 &&
1220                     prevBoundary<prevSrc &&
1221                     // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
1222                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1223                     // passed the quick check "yes && ccc==0" test.
1224                     // Check whether the last character was a "yesYes" or a "yesNo".
1225                     // If a "yesNo", then we get its trailing ccc from its
1226                     // mapping and check for canonical order.
1227                     // All other cases are ok.
1228                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1229                 ) {
1230                     // Fails FCD test, need to decompose and contiguously recompose.
1231                     if(!doCompose) {
1232                         return false;
1233                     }
1234                 } else if(doCompose) {
1235                     buffer.append(c, cc);
1236                     continue;
1237                 } else if(prevCC<=cc) {
1238                     prevCC=cc;
1239                     continue;
1240                 } else {
1241                     return false;
1242                 }
1243             } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
1244                 return false;
1245             }
1246 
1247             /*
1248              * Find appropriate boundaries around this character,
1249              * decompose the source text from between the boundaries,
1250              * and recompose it.
1251              *
1252              * We may need to remove the last few characters from the ReorderingBuffer
1253              * to account for source text that was copied or appended
1254              * but needs to take part in the recomposition.
1255              */
1256 
1257             /*
1258              * Find the last composition boundary in [prevBoundary..src[.
1259              * It is either the decomposition of the current character (at prevSrc),
1260              * or prevBoundary.
1261              */
1262             if(hasCompBoundaryBefore(c, norm16)) {
1263                 prevBoundary=prevSrc;
1264             } else if(doCompose) {
1265                 buffer.removeSuffix(prevSrc-prevBoundary);
1266             }
1267 
1268             // Find the next composition boundary in [src..limit[ -
1269             // modifies src to point to the next starter.
1270             src=findNextCompBoundary(s, src, limit);
1271 
1272             // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
1273             int recomposeStartIndex=buffer.length();
1274             decomposeShort(s, prevBoundary, src, buffer);
1275             recompose(buffer, recomposeStartIndex, onlyContiguous);
1276             if(!doCompose) {
1277                 if(!buffer.equals(s, prevBoundary, src)) {
1278                     return false;
1279                 }
1280                 buffer.remove();
1281                 prevCC=0;
1282             }
1283 
1284             // Move to the next starter. We never need to look back before this point again.
1285             prevBoundary=src;
1286         }
1287         return true;
1288     }
1289     /**
1290      * Very similar to compose(): Make the same changes in both places if relevant.
1291      * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
1292      * !doSpan: quickCheck
1293      * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
1294      *         bit 0: set if "maybe"; otherwise, if the span length&lt;s.length()
1295      *         then the quick check result is "no"
1296      */
composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan)1297     public int composeQuickCheck(CharSequence s, int src, int limit,
1298                                  boolean onlyContiguous, boolean doSpan) {
1299         int qcResult=0;
1300         int minNoMaybeCP=minCompNoMaybeCP;
1301 
1302         /*
1303          * prevBoundary points to the last character before the current one
1304          * that has a composition boundary before it with ccc==0 and quick check "yes".
1305          */
1306         int prevBoundary=src;
1307         int prevSrc;
1308         int c=0;
1309         int norm16=0;
1310         int prevCC=0;
1311 
1312         for(;;) {
1313             // count code units below the minimum or with irrelevant data for the quick check
1314             for(prevSrc=src;;) {
1315                 if(src==limit) {
1316                     return (src<<1)|qcResult;  // "yes" or "maybe"
1317                 }
1318                 if( (c=s.charAt(src))<minNoMaybeCP ||
1319                     isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
1320                 ) {
1321                     ++src;
1322                 } else if(!UTF16.isSurrogate((char)c)) {
1323                     break;
1324                 } else {
1325                     char c2;
1326                     if(UTF16Plus.isSurrogateLead(c)) {
1327                         if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1328                             c=Character.toCodePoint((char)c, c2);
1329                         }
1330                     } else /* trail surrogate */ {
1331                         if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1332                             --src;
1333                             c=Character.toCodePoint(c2, (char)c);
1334                         }
1335                     }
1336                     if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
1337                         src+=Character.charCount(c);
1338                     } else {
1339                         break;
1340                     }
1341                 }
1342             }
1343             if(src!=prevSrc) {
1344                 // Set prevBoundary to the last character in the quick check loop.
1345                 prevBoundary=src-1;
1346                 if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
1347                         Character.isHighSurrogate(s.charAt(prevBoundary-1))
1348                 ) {
1349                     --prevBoundary;
1350                 }
1351                 prevCC=0;
1352                 // The start of the current character (c).
1353                 prevSrc=src;
1354             }
1355 
1356             src+=Character.charCount(c);
1357             /*
1358              * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
1359              * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
1360              * or has ccc!=0.
1361              */
1362             if(isMaybeOrNonZeroCC(norm16)) {
1363                 int cc=getCCFromYesOrMaybe(norm16);
1364                 if( onlyContiguous &&  // FCC
1365                     cc!=0 &&
1366                     prevCC==0 &&
1367                     prevBoundary<prevSrc &&
1368                     // prevCC==0 && prevBoundary<prevSrc tell us that
1369                     // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
1370                     // passed the quick check "yes && ccc==0" test.
1371                     // Check whether the last character was a "yesYes" or a "yesNo".
1372                     // If a "yesNo", then we get its trailing ccc from its
1373                     // mapping and check for canonical order.
1374                     // All other cases are ok.
1375                     getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
1376                 ) {
1377                     // Fails FCD test.
1378                 } else if(prevCC<=cc || cc==0) {
1379                     prevCC=cc;
1380                     if(norm16<MIN_YES_YES_WITH_CC) {
1381                         if(!doSpan) {
1382                             qcResult=1;
1383                         } else {
1384                             return prevBoundary<<1;  // spanYes does not care to know it's "maybe"
1385                         }
1386                     }
1387                     continue;
1388                 }
1389             }
1390             return prevBoundary<<1;  // "no"
1391         }
1392     }
composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer)1393     public void composeAndAppend(CharSequence s,
1394                                  boolean doCompose,
1395                                  boolean onlyContiguous,
1396                                  ReorderingBuffer buffer) {
1397         int src=0, limit=s.length();
1398         if(!buffer.isEmpty()) {
1399             int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
1400             if(0!=firstStarterInSrc) {
1401                 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
1402                                                                buffer.length());
1403                 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
1404                                                        firstStarterInSrc+16);
1405                 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
1406                 buffer.removeSuffix(buffer.length()-lastStarterInDest);
1407                 middle.append(s, 0, firstStarterInSrc);
1408                 compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
1409                 src=firstStarterInSrc;
1410             }
1411         }
1412         if(doCompose) {
1413             compose(s, src, limit, onlyContiguous, true, buffer);
1414         } else {
1415             buffer.append(s, src, limit);
1416         }
1417     }
1418     // Dual functionality:
1419     // buffer!=NULL: normalize
1420     // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer)1421     public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
1422         // Note: In this function we use buffer->appendZeroCC() because we track
1423         // the lead and trail combining classes here, rather than leaving it to
1424         // the ReorderingBuffer.
1425         // The exception is the call to decomposeShort() which uses the buffer
1426         // in the normal way.
1427 
1428         // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
1429         // Similar to the prevBoundary in the compose() implementation.
1430         int prevBoundary=src;
1431         int prevSrc;
1432         int c=0;
1433         int prevFCD16=0;
1434         int fcd16=0;
1435 
1436         for(;;) {
1437             // count code units with lccc==0
1438             for(prevSrc=src; src!=limit;) {
1439                 if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
1440                     prevFCD16=~c;
1441                     ++src;
1442                 } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
1443                     prevFCD16=0;
1444                     ++src;
1445                 } else {
1446                     if(UTF16.isSurrogate((char)c)) {
1447                         char c2;
1448                         if(UTF16Plus.isSurrogateLead(c)) {
1449                             if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
1450                                 c=Character.toCodePoint((char)c, c2);
1451                             }
1452                         } else /* trail surrogate */ {
1453                             if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
1454                                 --src;
1455                                 c=Character.toCodePoint(c2, (char)c);
1456                             }
1457                         }
1458                     }
1459                     if((fcd16=getFCD16FromNormData(c))<=0xff) {
1460                         prevFCD16=fcd16;
1461                         src+=Character.charCount(c);
1462                     } else {
1463                         break;
1464                     }
1465                 }
1466             }
1467             // copy these code units all at once
1468             if(src!=prevSrc) {
1469                 if(src==limit) {
1470                     if(buffer!=null) {
1471                         buffer.flushAndAppendZeroCC(s, prevSrc, src);
1472                     }
1473                     break;
1474                 }
1475                 prevBoundary=src;
1476                 // We know that the previous character's lccc==0.
1477                 if(prevFCD16<0) {
1478                     // Fetching the fcd16 value was deferred for this below-U+0300 code point.
1479                     int prev=~prevFCD16;
1480                     prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
1481                     if(prevFCD16>1) {
1482                         --prevBoundary;
1483                     }
1484                 } else {
1485                     int p=src-1;
1486                     if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
1487                         Character.isHighSurrogate(s.charAt(p-1))
1488                     ) {
1489                         --p;
1490                         // Need to fetch the previous character's FCD value because
1491                         // prevFCD16 was just for the trail surrogate code point.
1492                         prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
1493                         // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
1494                     }
1495                     if(prevFCD16>1) {
1496                         prevBoundary=p;
1497                     }
1498                 }
1499                 if(buffer!=null) {
1500                     // The last lccc==0 character is excluded from the
1501                     // flush-and-append call in case it needs to be modified.
1502                     buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
1503                     buffer.append(s, prevBoundary, src);
1504                 }
1505                 // The start of the current character (c).
1506                 prevSrc=src;
1507             } else if(src==limit) {
1508                 break;
1509             }
1510 
1511             src+=Character.charCount(c);
1512             // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
1513             // Check for proper order, and decompose locally if necessary.
1514             if((prevFCD16&0xff)<=(fcd16>>8)) {
1515                 // proper order: prev tccc <= current lccc
1516                 if((fcd16&0xff)<=1) {
1517                     prevBoundary=src;
1518                 }
1519                 if(buffer!=null) {
1520                     buffer.appendZeroCC(c);
1521                 }
1522                 prevFCD16=fcd16;
1523                 continue;
1524             } else if(buffer==null) {
1525                 return prevBoundary;  // quick check "no"
1526             } else {
1527                 /*
1528                  * Back out the part of the source that we copied or appended
1529                  * already but is now going to be decomposed.
1530                  * prevSrc is set to after what was copied/appended.
1531                  */
1532                 buffer.removeSuffix(prevSrc-prevBoundary);
1533                 /*
1534                  * Find the part of the source that needs to be decomposed,
1535                  * up to the next safe boundary.
1536                  */
1537                 src=findNextFCDBoundary(s, src, limit);
1538                 /*
1539                  * The source text does not fulfill the conditions for FCD.
1540                  * Decompose and reorder a limited piece of the text.
1541                  */
1542                 decomposeShort(s, prevBoundary, src, buffer);
1543                 prevBoundary=src;
1544                 prevFCD16=0;
1545             }
1546         }
1547         return src;
1548     }
makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer)1549     public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
1550         int src=0, limit=s.length();
1551         if(!buffer.isEmpty()) {
1552             int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
1553             if(0!=firstBoundaryInSrc) {
1554                 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
1555                                                                buffer.length());
1556                 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
1557                                                        firstBoundaryInSrc+16);
1558                 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
1559                 buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
1560                 middle.append(s, 0, firstBoundaryInSrc);
1561                 makeFCD(middle, 0, middle.length(), buffer);
1562                 src=firstBoundaryInSrc;
1563             }
1564         }
1565         if(doMakeFCD) {
1566             makeFCD(s, src, limit, buffer);
1567         } else {
1568             buffer.append(s, src, limit);
1569         }
1570     }
1571 
1572     // Note: hasDecompBoundary() could be implemented as aliases to
1573     // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
1574     // at the cost of building the FCD trie for a decomposition normalizer.
hasDecompBoundary(int c, boolean before)1575     public boolean hasDecompBoundary(int c, boolean before) {
1576         for(;;) {
1577             if(c<minDecompNoCP) {
1578                 return true;
1579             }
1580             int norm16=getNorm16(c);
1581             if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
1582                 return true;
1583             } else if(norm16>MIN_NORMAL_MAYBE_YES) {
1584                 return false;  // ccc!=0
1585             } else if(isDecompNoAlgorithmic(norm16)) {
1586                 c=mapAlgorithmic(c, norm16);
1587             } else {
1588                 // c decomposes, get everything from the variable-length extra data
1589                 int firstUnit=extraData.charAt(norm16);
1590                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
1591                     return false;
1592                 }
1593                 if(!before) {
1594                     // decomp after-boundary: same as hasFCDBoundaryAfter(),
1595                     // fcd16<=1 || trailCC==0
1596                     if(firstUnit>0x1ff) {
1597                         return false;  // trailCC>1
1598                     }
1599                     if(firstUnit<=0xff) {
1600                         return true;  // trailCC==0
1601                     }
1602                     // if(trailCC==1) test leadCC==0, same as checking for before-boundary
1603                 }
1604                 // true if leadCC==0 (hasFCDBoundaryBefore())
1605                 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
1606             }
1607         }
1608     }
isDecompInert(int c)1609     public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }
1610 
hasCompBoundaryBefore(int c)1611     public boolean hasCompBoundaryBefore(int c) {
1612         return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
1613     }
hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert)1614     public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {
1615         for(;;) {
1616             int norm16=getNorm16(c);
1617             if(isInert(norm16)) {
1618                 return true;
1619             } else if(norm16<=minYesNo) {
1620                 // Hangul: norm16==minYesNo
1621                 // Hangul LVT has a boundary after it.
1622                 // Hangul LV and non-inert yesYes characters combine forward.
1623                 return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
1624             } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
1625                 return false;
1626             } else if(isDecompNoAlgorithmic(norm16)) {
1627                 c=mapAlgorithmic(c, norm16);
1628             } else {
1629                 // c decomposes, get everything from the variable-length extra data.
1630                 // If testInert, then c must be a yesNo character which has lccc=0,
1631                 // otherwise it could be a noNo.
1632                 int firstUnit=extraData.charAt(norm16);
1633                 // true if
1634                 //   not MAPPING_NO_COMP_BOUNDARY_AFTER
1635                 //     (which is set if
1636                 //       c is not deleted, and
1637                 //       it and its decomposition do not combine forward, and it has a starter)
1638                 //   and if FCC then trailCC<=1
1639                 return
1640                     (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
1641                     (!onlyContiguous || firstUnit<=0x1ff);
1642             }
1643         }
1644     }
1645 
hasFCDBoundaryBefore(int c)1646     public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
hasFCDBoundaryAfter(int c)1647     public boolean hasFCDBoundaryAfter(int c) {
1648         int fcd16=getFCD16(c);
1649         return fcd16<=1 || (fcd16&0xff)==0;
1650     }
isFCDInert(int c)1651     public boolean isFCDInert(int c) { return getFCD16(c)<=1; }
1652 
isMaybe(int norm16)1653     private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
isMaybeOrNonZeroCC(int norm16)1654     private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
isInert(int norm16)1655     private static boolean isInert(int norm16) { return norm16==0; }
isJamoL(int norm16)1656     private static boolean isJamoL(int norm16) { return norm16==1; }
isJamoVT(int norm16)1657     private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
isHangul(int norm16)1658     private boolean isHangul(int norm16) { return norm16==minYesNo; }
isCompYesAndZeroCC(int norm16)1659     private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
1660     // UBool isCompYes(uint16_t norm16) const {
1661     //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
1662     // }
1663     // UBool isCompYesOrMaybe(uint16_t norm16) const {
1664     //     return norm16<minNoNo || minMaybeYes<=norm16;
1665     // }
1666     // private boolean hasZeroCCFromDecompYes(int norm16) {
1667     //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1668     // }
isDecompYesAndZeroCC(int norm16)1669     private boolean isDecompYesAndZeroCC(int norm16) {
1670         return norm16<minYesNo ||
1671                norm16==JAMO_VT ||
1672                (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
1673     }
1674     /**
1675      * A little faster and simpler than isDecompYesAndZeroCC() but does not include
1676      * the MaybeYes which combine-forward and have ccc=0.
1677      * (Standard Unicode 5.2 normalization does not have such characters.)
1678      */
isMostDecompYesAndZeroCC(int norm16)1679     private boolean isMostDecompYesAndZeroCC(int norm16) {
1680         return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
1681     }
isDecompNoAlgorithmic(int norm16)1682     private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
1683 
1684     // For use with isCompYes().
1685     // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
1686     // static uint8_t getCCFromYes(uint16_t norm16) {
1687     //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
1688     // }
getCCFromNoNo(int norm16)1689     private int getCCFromNoNo(int norm16) {
1690         if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1691             return extraData.charAt(norm16-1)&0xff;
1692         } else {
1693             return 0;
1694         }
1695     }
1696     // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit)1697     int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
1698         int c;
1699         if(cpStart==(cpLimit-1)) {
1700             c=s.charAt(cpStart);
1701         } else {
1702             c=Character.codePointAt(s, cpStart);
1703         }
1704         int prevNorm16=getNorm16(c);
1705         if(prevNorm16<=minYesNo) {
1706             return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
1707         } else {
1708             return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo
1709         }
1710     }
1711 
1712     // Requires algorithmic-NoNo.
mapAlgorithmic(int c, int norm16)1713     private int mapAlgorithmic(int c, int norm16) {
1714         return c+norm16-(minMaybeYes-MAX_DELTA-1);
1715     }
1716 
1717     // Requires minYesNo<norm16<limitNoNo.
1718     // private int getMapping(int norm16) { return /*extraData+*/norm16; }
1719 
1720     /**
1721      * @return index into maybeYesCompositions, or -1
1722      */
getCompositionsListForDecompYes(int norm16)1723     private int getCompositionsListForDecompYes(int norm16) {
1724         if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
1725             return -1;
1726         } else {
1727             if((norm16-=minMaybeYes)<0) {
1728                 // norm16<minMaybeYes: index into extraData which is a substring at
1729                 //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
1730                 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
1731                 norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
1732             }
1733             return norm16;
1734         }
1735     }
1736     /**
1737      * @return index into maybeYesCompositions
1738      */
getCompositionsListForComposite(int norm16)1739     private int getCompositionsListForComposite(int norm16) {
1740         // composite has both mapping & compositions list
1741         int firstUnit=extraData.charAt(norm16);
1742         return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+  // mapping in maybeYesCompositions
1743             1+  // +1 to skip the first unit with the mapping lenth
1744             (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
1745     }
1746     /**
1747      * @param c code point must have compositions
1748      * @return index into maybeYesCompositions
1749      */
getCompositionsList(int norm16)1750     private int getCompositionsList(int norm16) {
1751         return isDecompYes(norm16) ?
1752                 getCompositionsListForDecompYes(norm16) :
1753                 getCompositionsListForComposite(norm16);
1754     }
1755 
1756     // Decompose a short piece of text which is likely to contain characters that
1757     // fail the quick check loop and/or where the quick check loop's overhead
1758     // is unlikely to be amortized.
1759     // Called by the compose() and makeFCD() implementations.
1760     // Public in Java for collation implementation code.
decomposeShort(CharSequence s, int src, int limit, ReorderingBuffer buffer)1761     public void decomposeShort(CharSequence s, int src, int limit,
1762                                ReorderingBuffer buffer) {
1763         while(src<limit) {
1764             int c=Character.codePointAt(s, src);
1765             src+=Character.charCount(c);
1766             decompose(c, getNorm16(c), buffer);
1767         }
1768     }
decompose(int c, int norm16, ReorderingBuffer buffer)1769     private void decompose(int c, int norm16,
1770                            ReorderingBuffer buffer) {
1771         // Only loops for 1:1 algorithmic mappings.
1772         for(;;) {
1773             // get the decomposition and the lead and trail cc's
1774             if(isDecompYes(norm16)) {
1775                 // c does not decompose
1776                 buffer.append(c, getCCFromYesOrMaybe(norm16));
1777             } else if(isHangul(norm16)) {
1778                 // Hangul syllable: decompose algorithmically
1779                 Hangul.decompose(c, buffer);
1780             } else if(isDecompNoAlgorithmic(norm16)) {
1781                 c=mapAlgorithmic(c, norm16);
1782                 norm16=getNorm16(c);
1783                 continue;
1784             } else {
1785                 // c decomposes, get everything from the variable-length extra data
1786                 int firstUnit=extraData.charAt(norm16);
1787                 int length=firstUnit&MAPPING_LENGTH_MASK;
1788                 int leadCC, trailCC;
1789                 trailCC=firstUnit>>8;
1790                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
1791                     leadCC=extraData.charAt(norm16-1)>>8;
1792                 } else {
1793                     leadCC=0;
1794                 }
1795                 ++norm16;  // skip over the firstUnit
1796                 buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
1797             }
1798             return;
1799         }
1800     }
1801 
1802     /**
1803      * Finds the recomposition result for
1804      * a forward-combining "lead" character,
1805      * specified with a pointer to its compositions list,
1806      * and a backward-combining "trail" character.
1807      *
1808      * <p>If the lead and trail characters combine, then this function returns
1809      * the following "compositeAndFwd" value:
1810      * <pre>
1811      * Bits 21..1  composite character
1812      * Bit      0  set if the composite is a forward-combining starter
1813      * </pre>
1814      * otherwise it returns -1.
1815      *
1816      * <p>The compositions list has (trail, compositeAndFwd) pair entries,
1817      * encoded as either pairs or triples of 16-bit units.
1818      * The last entry has the high bit of its first unit set.
1819      *
1820      * <p>The list is sorted by ascending trail characters (there are no duplicates).
1821      * A linear search is used.
1822      *
1823      * <p>See normalizer2impl.h for a more detailed description
1824      * of the compositions list format.
1825      */
combine(String compositions, int list, int trail)1826     private static int combine(String compositions, int list, int trail) {
1827         int key1, firstUnit;
1828         if(trail<COMP_1_TRAIL_LIMIT) {
1829             // trail character is 0..33FF
1830             // result entry may have 2 or 3 units
1831             key1=(trail<<1);
1832             while(key1>(firstUnit=compositions.charAt(list))) {
1833                 list+=2+(firstUnit&COMP_1_TRIPLE);
1834             }
1835             if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1836                 if((firstUnit&COMP_1_TRIPLE)!=0) {
1837                     return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
1838                 } else {
1839                     return compositions.charAt(list+1);
1840                 }
1841             }
1842         } else {
1843             // trail character is 3400..10FFFF
1844             // result entry has 3 units
1845             key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
1846             int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
1847             int secondUnit;
1848             for(;;) {
1849                 if(key1>(firstUnit=compositions.charAt(list))) {
1850                     list+=2+(firstUnit&COMP_1_TRIPLE);
1851                 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
1852                     if(key2>(secondUnit=compositions.charAt(list+1))) {
1853                         if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
1854                             break;
1855                         } else {
1856                             list+=3;
1857                         }
1858                     } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
1859                         return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
1860                     } else {
1861                         break;
1862                     }
1863                 } else {
1864                     break;
1865                 }
1866             }
1867         }
1868         return -1;
1869     }
1870     /**
1871      * @param list some character's compositions list
1872      * @param set recursively receives the composites from these compositions
1873      */
addComposites(int list, UnicodeSet set)1874     private void addComposites(int list, UnicodeSet set) {
1875         int firstUnit, compositeAndFwd;
1876         do {
1877             firstUnit=maybeYesCompositions.charAt(list);
1878             if((firstUnit&COMP_1_TRIPLE)==0) {
1879                 compositeAndFwd=maybeYesCompositions.charAt(list+1);
1880                 list+=2;
1881             } else {
1882                 compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
1883                                 maybeYesCompositions.charAt(list+2);
1884                 list+=3;
1885             }
1886             int composite=compositeAndFwd>>1;
1887             if((compositeAndFwd&1)!=0) {
1888                 addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
1889             }
1890             set.add(composite);
1891         } while((firstUnit&COMP_1_LAST_TUPLE)==0);
1892     }
1893     /*
1894      * Recomposes the buffer text starting at recomposeStartIndex
1895      * (which is in NFD - decomposed and canonically ordered),
1896      * and truncates the buffer contents.
1897      *
1898      * Note that recomposition never lengthens the text:
1899      * Any character consists of either one or two code units;
1900      * a composition may contain at most one more code unit than the original starter,
1901      * while the combining mark that is removed has at least one code unit.
1902      */
recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous)1903     private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
1904                            boolean onlyContiguous) {
1905         StringBuilder sb=buffer.getStringBuilder();
1906         int p=recomposeStartIndex;
1907         if(p==sb.length()) {
1908             return;
1909         }
1910 
1911         int starter, pRemove;
1912         int compositionsList;
1913         int c, compositeAndFwd;
1914         int norm16;
1915         int cc, prevCC;
1916         boolean starterIsSupplementary;
1917 
1918         // Some of the following variables are not used until we have a forward-combining starter
1919         // and are only initialized now to avoid compiler warnings.
1920         compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
1921         starter=-1;
1922         starterIsSupplementary=false;
1923         prevCC=0;
1924 
1925         for(;;) {
1926             c=sb.codePointAt(p);
1927             p+=Character.charCount(c);
1928             norm16=getNorm16(c);
1929             cc=getCCFromYesOrMaybe(norm16);
1930             if( // this character combines backward and
1931                 isMaybe(norm16) &&
1932                 // we have seen a starter that combines forward and
1933                 compositionsList>=0 &&
1934                 // the backward-combining character is not blocked
1935                 (prevCC<cc || prevCC==0)
1936             ) {
1937                 if(isJamoVT(norm16)) {
1938                     // c is a Jamo V/T, see if we can compose it with the previous character.
1939                     if(c<Hangul.JAMO_T_BASE) {
1940                         // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1941                         char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1942                         if(prev<Hangul.JAMO_L_COUNT) {
1943                             pRemove=p-1;
1944                             char syllable=(char)
1945                                 (Hangul.HANGUL_BASE+
1946                                  (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1947                                  Hangul.JAMO_T_COUNT);
1948                             char t;
1949                             if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1950                                 ++p;
1951                                 syllable+=t;  // The next character was a Jamo T.
1952                             }
1953                             sb.setCharAt(starter, syllable);
1954                             // remove the Jamo V/T
1955                             sb.delete(pRemove, p);
1956                             p=pRemove;
1957                         }
1958                     }
1959                     /*
1960                      * No "else" for Jamo T:
1961                      * Since the input is in NFD, there are no Hangul LV syllables that
1962                      * a Jamo T could combine with.
1963                      * All Jamo Ts are combined above when handling Jamo Vs.
1964                      */
1965                     if(p==sb.length()) {
1966                         break;
1967                     }
1968                     compositionsList=-1;
1969                     continue;
1970                 } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
1971                     // The starter and the combining mark (c) do combine.
1972                     int composite=compositeAndFwd>>1;
1973 
1974                     // Remove the combining mark.
1975                     pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
1976                     sb.delete(pRemove, p);
1977                     p=pRemove;
1978                     // Replace the starter with the composite.
1979                     if(starterIsSupplementary) {
1980                         if(composite>0xffff) {
1981                             // both are supplementary
1982                             sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1983                             sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
1984                         } else {
1985                             sb.setCharAt(starter, (char)c);
1986                             sb.deleteCharAt(starter+1);
1987                             // The composite is shorter than the starter,
1988                             // move the intermediate characters forward one.
1989                             starterIsSupplementary=false;
1990                             --p;
1991                         }
1992                     } else if(composite>0xffff) {
1993                         // The composite is longer than the starter,
1994                         // move the intermediate characters back one.
1995                         starterIsSupplementary=true;
1996                         sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
1997                         sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
1998                         ++p;
1999                     } else {
2000                         // both are on the BMP
2001                         sb.setCharAt(starter, (char)composite);
2002                     }
2003 
2004                     // Keep prevCC because we removed the combining mark.
2005 
2006                     if(p==sb.length()) {
2007                         break;
2008                     }
2009                     // Is the composite a starter that combines forward?
2010                     if((compositeAndFwd&1)!=0) {
2011                         compositionsList=
2012                             getCompositionsListForComposite(getNorm16(composite));
2013                     } else {
2014                         compositionsList=-1;
2015                     }
2016 
2017                     // We combined; continue with looking for compositions.
2018                     continue;
2019                 }
2020             }
2021 
2022             // no combination this time
2023             prevCC=cc;
2024             if(p==sb.length()) {
2025                 break;
2026             }
2027 
2028             // If c did not combine, then check if it is a starter.
2029             if(cc==0) {
2030                 // Found a new starter.
2031                 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
2032                     // It may combine with something, prepare for it.
2033                     if(c<=0xffff) {
2034                         starterIsSupplementary=false;
2035                         starter=p-1;
2036                     } else {
2037                         starterIsSupplementary=true;
2038                         starter=p-2;
2039                     }
2040                 }
2041             } else if(onlyContiguous) {
2042                 // FCC: no discontiguous compositions; any intervening character blocks.
2043                 compositionsList=-1;
2044             }
2045         }
2046         buffer.flush();
2047     }
2048 
composePair(int a, int b)2049     public int composePair(int a, int b) {
2050         int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
2051         int list;
2052         if(isInert(norm16)) {
2053             return -1;
2054         } else if(norm16<minYesNoMappingsOnly) {
2055             if(isJamoL(norm16)) {
2056                 b-=Hangul.JAMO_V_BASE;
2057                 if(0<=b && b<Hangul.JAMO_V_COUNT) {
2058                     return
2059                         (Hangul.HANGUL_BASE+
2060                          ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
2061                          Hangul.JAMO_T_COUNT);
2062                 } else {
2063                     return -1;
2064                 }
2065             } else if(isHangul(norm16)) {
2066                 b-=Hangul.JAMO_T_BASE;
2067                 if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
2068                     return a+b;
2069                 } else {
2070                     return -1;
2071                 }
2072             } else {
2073                 // 'a' has a compositions list in extraData
2074                 list=norm16;
2075                 if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
2076                     list+=  // mapping pointer
2077                         1+  // +1 to skip the first unit with the mapping lenth
2078                         (extraData.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
2079                 }
2080                 // Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
2081                 list+=MIN_NORMAL_MAYBE_YES-minMaybeYes;
2082             }
2083         } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
2084             return -1;
2085         } else {
2086             list=norm16-minMaybeYes;  // offset into maybeYesCompositions
2087         }
2088         if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
2089             return -1;
2090         }
2091         return combine(maybeYesCompositions, list, b)>>1;
2092     }
2093 
2094     /**
2095      * Does c have a composition boundary before it?
2096      * True if its decomposition begins with a character that has
2097      * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
2098      * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
2099      * (isCompYesAndZeroCC()) so we need not decompose.
2100      */
hasCompBoundaryBefore(int c, int norm16)2101     private boolean hasCompBoundaryBefore(int c, int norm16) {
2102         for(;;) {
2103             if(isCompYesAndZeroCC(norm16)) {
2104                 return true;
2105             } else if(isMaybeOrNonZeroCC(norm16)) {
2106                 return false;
2107             } else if(isDecompNoAlgorithmic(norm16)) {
2108                 c=mapAlgorithmic(c, norm16);
2109                 norm16=getNorm16(c);
2110             } else {
2111                 // c decomposes, get everything from the variable-length extra data
2112                 int firstUnit=extraData.charAt(norm16);
2113                 if((firstUnit&MAPPING_LENGTH_MASK)==0) {
2114                     return false;
2115                 }
2116                 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
2117                     return false;  // non-zero leadCC
2118                 }
2119                 return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
2120             }
2121         }
2122     }
findPreviousCompBoundary(CharSequence s, int p)2123     private int findPreviousCompBoundary(CharSequence s, int p) {
2124         while(p>0) {
2125             int c=Character.codePointBefore(s, p);
2126             p-=Character.charCount(c);
2127             if(hasCompBoundaryBefore(c)) {
2128                 break;
2129             }
2130             // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
2131             // but that's probably not worth the extra cost.
2132         }
2133         return p;
2134     }
findNextCompBoundary(CharSequence s, int p, int limit)2135     private int findNextCompBoundary(CharSequence s, int p, int limit) {
2136         while(p<limit) {
2137             int c=Character.codePointAt(s, p);
2138             int norm16=normTrie.get(c);
2139             if(hasCompBoundaryBefore(c, norm16)) {
2140                 break;
2141             }
2142             p+=Character.charCount(c);
2143         }
2144         return p;
2145     }
2146 
findPreviousFCDBoundary(CharSequence s, int p)2147     private int findPreviousFCDBoundary(CharSequence s, int p) {
2148         while(p>0) {
2149             int c=Character.codePointBefore(s, p);
2150             p-=Character.charCount(c);
2151             if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
2152                 break;
2153             }
2154         }
2155         return p;
2156     }
findNextFCDBoundary(CharSequence s, int p, int limit)2157     private int findNextFCDBoundary(CharSequence s, int p, int limit) {
2158         while(p<limit) {
2159             int c=Character.codePointAt(s, p);
2160             if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
2161                 break;
2162             }
2163             p+=Character.charCount(c);
2164         }
2165         return p;
2166     }
2167 
addToStartSet(Trie2Writable newData, int origin, int decompLead)2168     private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
2169         int canonValue=newData.get(decompLead);
2170         if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
2171             // origin is the first character whose decomposition starts with
2172             // the character for which we are setting the value.
2173             newData.set(decompLead, canonValue|origin);
2174         } else {
2175             // origin is not the first character, or it is U+0000.
2176             UnicodeSet set;
2177             if((canonValue&CANON_HAS_SET)==0) {
2178                 int firstOrigin=canonValue&CANON_VALUE_MASK;
2179                 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
2180                 newData.set(decompLead, canonValue);
2181                 canonStartSets.add(set=new UnicodeSet());
2182                 if(firstOrigin!=0) {
2183                     set.add(firstOrigin);
2184                 }
2185             } else {
2186                 set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
2187             }
2188             set.add(origin);
2189         }
2190     }
2191 
2192     @SuppressWarnings("unused")
2193     private VersionInfo dataVersion;
2194 
2195     // Code point thresholds for quick check codes.
2196     private int minDecompNoCP;
2197     private int minCompNoMaybeCP;
2198 
2199     // Norm16 value thresholds for quick check combinations and types of extra data.
2200     private int minYesNo;
2201     private int minYesNoMappingsOnly;
2202     private int minNoNo;
2203     private int limitNoNo;
2204     private int minMaybeYes;
2205 
2206     private Trie2_16 normTrie;
2207     private String maybeYesCompositions;
2208     private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
2209     private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
2210     private int[] tccc180;  // [0x180] tccc values for U+0000..U+017F
2211 
2212     private Trie2_32 canonIterData;
2213     private ArrayList<UnicodeSet> canonStartSets;
2214 
2215     // bits in canonIterData
2216     private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
2217     private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
2218     private static final int CANON_HAS_SET = 0x200000;
2219     private static final int CANON_VALUE_MASK = 0x1fffff;
2220 }
2221