1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * TailoredSet.java, ported from collationsets.h/.cpp
9 *
10 * C++ version created on: 2013feb09
11 * created by: Markus W. Scherer
12 */
13 
14 package com.ibm.icu.impl.coll;
15 
16 import java.util.Iterator;
17 
18 import com.ibm.icu.impl.Normalizer2Impl.Hangul;
19 import com.ibm.icu.impl.Trie2;
20 import com.ibm.icu.impl.Utility;
21 import com.ibm.icu.text.UnicodeSet;
22 import com.ibm.icu.util.CharsTrie;
23 import com.ibm.icu.util.CharsTrie.Entry;
24 
25 /**
26  * Finds the set of characters and strings that sort differently in the tailoring
27  * from the base data.
28  *
29  * Every mapping in the tailoring needs to be compared to the base,
30  * because some mappings are copied for optimization, and
31  * all contractions for a character are copied if any contractions for that character
32  * are added, modified or removed.
33  *
34  * It might be simpler to re-parse the rule string, but:
35  * - That would require duplicating some of the from-rules builder code.
36  * - That would make the runtime code depend on the builder.
37  * - That would only work if we have the rule string, and we allow users to
38  *   omit the rule string from data files.
39  */
40 public final class TailoredSet {
41 
42     private CollationData data;
43     private CollationData baseData;
44     private UnicodeSet tailored;
45     private StringBuilder unreversedPrefix = new StringBuilder();
46     private String suffix;
47 
TailoredSet(UnicodeSet t)48     public TailoredSet(UnicodeSet t) {
49         tailored = t;
50     }
51 
forData(CollationData d)52     public void forData(CollationData d) {
53         data = d;
54         baseData = d.base;
55         assert (baseData != null);
56         // utrie2_enum(data->trie, NULL, enumTailoredRange, this);
57         Iterator<Trie2.Range> trieIterator = data.trie.iterator();
58         Trie2.Range range;
59         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
60             enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this);
61         }
62     }
63 
enumTailoredRange(int start, int end, int ce32, TailoredSet ts)64     private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) {
65         if (ce32 == Collation.FALLBACK_CE32) {
66             return; // fallback to base, not tailored
67         }
68         ts.handleCE32(start, end, ce32);
69     }
70 
71     // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
72     //  Also, ICU4C requires handleCE32() to be public because it is used by the callback
73     //  function (enumTailoredRange()). This is not necessary for Java implementation.
handleCE32(int start, int end, int ce32)74     private void handleCE32(int start, int end, int ce32) {
75         assert (ce32 != Collation.FALLBACK_CE32);
76         if (Collation.isSpecialCE32(ce32)) {
77             ce32 = data.getIndirectCE32(ce32);
78             if (ce32 == Collation.FALLBACK_CE32) {
79                 return;
80             }
81         }
82         do {
83             int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start));
84             // Do not just continue if ce32 == baseCE32 because
85             // contractions and expansions in different data objects
86             // normally differ even if they have the same data offsets.
87             if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) {
88                 // fastpath
89                 if (ce32 != baseCE32) {
90                     tailored.add(start);
91                 }
92             } else {
93                 compare(start, ce32, baseCE32);
94             }
95         } while (++start <= end);
96     }
97 
compare(int c, int ce32, int baseCE32)98     private void compare(int c, int ce32, int baseCE32) {
99         if (Collation.isPrefixCE32(ce32)) {
100             int dataIndex = Collation.indexFromCE32(ce32);
101             ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
102             if (Collation.isPrefixCE32(baseCE32)) {
103                 int baseIndex = Collation.indexFromCE32(baseCE32);
104                 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
105                 comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
106             } else {
107                 addPrefixes(data, c, data.contexts, dataIndex + 2);
108             }
109         } else if (Collation.isPrefixCE32(baseCE32)) {
110             int baseIndex = Collation.indexFromCE32(baseCE32);
111             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
112             addPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
113         }
114 
115         if (Collation.isContractionCE32(ce32)) {
116             int dataIndex = Collation.indexFromCE32(ce32);
117             if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
118                 ce32 = Collation.NO_CE32;
119             } else {
120                 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
121             }
122             if (Collation.isContractionCE32(baseCE32)) {
123                 int baseIndex = Collation.indexFromCE32(baseCE32);
124                 if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
125                     baseCE32 = Collation.NO_CE32;
126                 } else {
127                     baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
128                 }
129                 compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
130             } else {
131                 addContractions(c, data.contexts, dataIndex + 2);
132             }
133         } else if (Collation.isContractionCE32(baseCE32)) {
134             int baseIndex = Collation.indexFromCE32(baseCE32);
135             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
136             addContractions(c, baseData.contexts, baseIndex + 2);
137         }
138 
139         int tag;
140         if (Collation.isSpecialCE32(ce32)) {
141             tag = Collation.tagFromCE32(ce32);
142             assert (tag != Collation.PREFIX_TAG);
143             assert (tag != Collation.CONTRACTION_TAG);
144             // Currently, the tailoring data builder does not write offset tags.
145             // They might be useful for saving space,
146             // but they would complicate the builder,
147             // and in tailorings we assume that performance of tailored characters is more important.
148             assert (tag != Collation.OFFSET_TAG);
149         } else {
150             tag = -1;
151         }
152         int baseTag;
153         if (Collation.isSpecialCE32(baseCE32)) {
154             baseTag = Collation.tagFromCE32(baseCE32);
155             assert (baseTag != Collation.PREFIX_TAG);
156             assert (baseTag != Collation.CONTRACTION_TAG);
157         } else {
158             baseTag = -1;
159         }
160 
161         // Non-contextual mappings, expansions, etc.
162         if (baseTag == Collation.OFFSET_TAG) {
163             // We might be comparing a tailoring CE which is a copy of
164             // a base offset-tag CE, via the [optimize [set]] syntax
165             // or when a single-character mapping was copied for tailored contractions.
166             // Offset tags always result in long-primary CEs,
167             // with common secondary/tertiary weights.
168             if (!Collation.isLongPrimaryCE32(ce32)) {
169                 add(c);
170                 return;
171             }
172             long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)];
173             long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
174             if (Collation.primaryFromLongPrimaryCE32(ce32) != p) {
175                 add(c);
176                 return;
177             }
178         }
179 
180         if (tag != baseTag) {
181             add(c);
182             return;
183         }
184 
185         if (tag == Collation.EXPANSION32_TAG) {
186             int length = Collation.lengthFromCE32(ce32);
187             int baseLength = Collation.lengthFromCE32(baseCE32);
188 
189             if (length != baseLength) {
190                 add(c);
191                 return;
192             }
193 
194             int idx0 = Collation.indexFromCE32(ce32);
195             int idx1 = Collation.indexFromCE32(baseCE32);
196 
197             for (int i = 0; i < length; ++i) {
198                 if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) {
199                     add(c);
200                     break;
201                 }
202             }
203         } else if (tag == Collation.EXPANSION_TAG) {
204             int length = Collation.lengthFromCE32(ce32);
205             int baseLength = Collation.lengthFromCE32(baseCE32);
206 
207             if (length != baseLength) {
208                 add(c);
209                 return;
210             }
211 
212             int idx0 = Collation.indexFromCE32(ce32);
213             int idx1 = Collation.indexFromCE32(baseCE32);
214 
215             for (int i = 0; i < length; ++i) {
216                 if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) {
217                     add(c);
218                     break;
219                 }
220             }
221         } else if (tag == Collation.HANGUL_TAG) {
222             StringBuilder jamos = new StringBuilder();
223             int length = Hangul.decompose(c, jamos);
224             if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1))
225                     || (length == 3 && tailored.contains(jamos.charAt(2)))) {
226                 add(c);
227             }
228         } else if (ce32 != baseCE32) {
229             add(c);
230         }
231     }
232 
comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx)233     private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
234         // Parallel iteration over prefixes of both tables.
235         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
236         CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator();
237         String tp = null; // Tailoring prefix.
238         String bp = null; // Base prefix.
239         // Use a string with a U+FFFF as the limit sentinel.
240         // U+FFFF is untailorable and will not occur in prefixes.
241         String none = "\uffff";
242         Entry te = null, be = null;
243         for (;;) {
244             if (tp == null) {
245                 if (prefixes.hasNext()) {
246                     te = prefixes.next();
247                     tp = te.chars.toString();
248                 } else {
249                     te = null;
250                     tp = none;
251                 }
252             }
253             if (bp == null) {
254                 if (basePrefixes.hasNext()) {
255                     be = basePrefixes.next();
256                     bp = be.chars.toString();
257                 } else {
258                     be = null;
259                     bp = none;
260                 }
261             }
262             if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) {
263                 break;
264             }
265             int cmp = tp.compareTo(bp);
266             if (cmp < 0) {
267                 // tp occurs in the tailoring but not in the base.
268                 assert (te != null);
269                 addPrefix(data, tp, c, te.value);
270                 te = null;
271                 tp = null;
272             } else if (cmp > 0) {
273                 // bp occurs in the base but not in the tailoring.
274                 assert (be != null);
275                 addPrefix(baseData, bp, c, be.value);
276                 be = null;
277                 bp = null;
278             } else {
279                 setPrefix(tp);
280                 assert (te != null && be != null);
281                 compare(c, te.value, be.value);
282                 resetPrefix();
283                 te = be = null;
284                 tp = bp = null;
285             }
286         }
287     }
288 
compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx)289     private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
290         // Parallel iteration over suffixes of both tables.
291         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
292         CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator();
293         String ts = null; // Tailoring suffix.
294         String bs = null; // Base suffix.
295         // Use a string with two U+FFFF as the limit sentinel.
296         // U+FFFF is untailorable and will not occur in contractions except maybe
297         // as a single suffix character for a root-collator boundary contraction.
298         String none = "\uffff\uffff";
299         Entry te = null, be = null;
300         for (;;) {
301             if (ts == null) {
302                 if (suffixes.hasNext()) {
303                     te = suffixes.next();
304                     ts = te.chars.toString();
305                 } else {
306                     te = null;
307                     ts = none;
308                 }
309             }
310             if (bs == null) {
311                 if (baseSuffixes.hasNext()) {
312                     be = baseSuffixes.next();
313                     bs = be.chars.toString();
314                 } else {
315                     be = null;
316                     bs = none;
317                 }
318             }
319             if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) {
320                 break;
321             }
322             int cmp = ts.compareTo(bs);
323             if (cmp < 0) {
324                 // ts occurs in the tailoring but not in the base.
325                 addSuffix(c, ts);
326                 te = null;
327                 ts = null;
328             } else if (cmp > 0) {
329                 // bs occurs in the base but not in the tailoring.
330                 addSuffix(c, bs);
331                 be = null;
332                 bs = null;
333             } else {
334                 suffix = ts;
335                 compare(c, te.value, be.value);
336                 suffix = null;
337                 te = be = null;
338                 ts = bs = null;
339             }
340         }
341     }
342 
addPrefixes(CollationData d, int c, CharSequence p, int pidx)343     private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) {
344         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
345         while (prefixes.hasNext()) {
346             Entry e = prefixes.next();
347             addPrefix(d, e.chars, c, e.value);
348         }
349     }
350 
addPrefix(CollationData d, CharSequence pfx, int c, int ce32)351     private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) {
352         setPrefix(pfx);
353         ce32 = d.getFinalCE32(ce32);
354         if (Collation.isContractionCE32(ce32)) {
355             int idx = Collation.indexFromCE32(ce32);
356             addContractions(c, d.contexts, idx + 2);
357         }
358         tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c)));
359         resetPrefix();
360     }
361 
addContractions(int c, CharSequence p, int pidx)362     private void addContractions(int c, CharSequence p, int pidx) {
363         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
364         while (suffixes.hasNext()) {
365             Entry e = suffixes.next();
366             addSuffix(c, e.chars);
367         }
368     }
369 
addSuffix(int c, CharSequence sfx)370     private void addSuffix(int c, CharSequence sfx) {
371         tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx));
372     }
373 
add(int c)374     private void add(int c) {
375         if (unreversedPrefix.length() == 0 && suffix == null) {
376             tailored.add(c);
377         } else {
378             StringBuilder s = new StringBuilder(unreversedPrefix);
379             s.appendCodePoint(c);
380             if (suffix != null) {
381                 s.append(suffix);
382             }
383             tailored.add(s);
384         }
385     }
386 
387     // Prefixes are reversed in the data structure.
setPrefix(CharSequence pfx)388     private void setPrefix(CharSequence pfx) {
389         unreversedPrefix.setLength(0);
390         unreversedPrefix.append(pfx).reverse();
391     }
392 
resetPrefix()393     private void resetPrefix() {
394         unreversedPrefix.setLength(0);
395     }
396 }
397 
398