1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * TailoredSet.java, ported from collationsets.h/.cpp 9 * 10 * C++ version created on: 2013feb09 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import java.util.Iterator; 17 18 import com.ibm.icu.impl.Normalizer2Impl.Hangul; 19 import com.ibm.icu.impl.Trie2; 20 import com.ibm.icu.impl.Utility; 21 import com.ibm.icu.text.UnicodeSet; 22 import com.ibm.icu.util.CharsTrie; 23 import com.ibm.icu.util.CharsTrie.Entry; 24 25 /** 26 * Finds the set of characters and strings that sort differently in the tailoring 27 * from the base data. 28 * 29 * Every mapping in the tailoring needs to be compared to the base, 30 * because some mappings are copied for optimization, and 31 * all contractions for a character are copied if any contractions for that character 32 * are added, modified or removed. 33 * 34 * It might be simpler to re-parse the rule string, but: 35 * - That would require duplicating some of the from-rules builder code. 36 * - That would make the runtime code depend on the builder. 37 * - That would only work if we have the rule string, and we allow users to 38 * omit the rule string from data files. 39 */ 40 public final class TailoredSet { 41 42 private CollationData data; 43 private CollationData baseData; 44 private UnicodeSet tailored; 45 private StringBuilder unreversedPrefix = new StringBuilder(); 46 private String suffix; 47 TailoredSet(UnicodeSet t)48 public TailoredSet(UnicodeSet t) { 49 tailored = t; 50 } 51 forData(CollationData d)52 public void forData(CollationData d) { 53 data = d; 54 baseData = d.base; 55 assert (baseData != null); 56 // utrie2_enum(data->trie, NULL, enumTailoredRange, this); 57 Iterator<Trie2.Range> trieIterator = data.trie.iterator(); 58 Trie2.Range range; 59 while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 60 enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this); 61 } 62 } 63 enumTailoredRange(int start, int end, int ce32, TailoredSet ts)64 private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) { 65 if (ce32 == Collation.FALLBACK_CE32) { 66 return; // fallback to base, not tailored 67 } 68 ts.handleCE32(start, end, ce32); 69 } 70 71 // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J. 72 // Also, ICU4C requires handleCE32() to be public because it is used by the callback 73 // function (enumTailoredRange()). This is not necessary for Java implementation. handleCE32(int start, int end, int ce32)74 private void handleCE32(int start, int end, int ce32) { 75 assert (ce32 != Collation.FALLBACK_CE32); 76 if (Collation.isSpecialCE32(ce32)) { 77 ce32 = data.getIndirectCE32(ce32); 78 if (ce32 == Collation.FALLBACK_CE32) { 79 return; 80 } 81 } 82 do { 83 int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start)); 84 // Do not just continue if ce32 == baseCE32 because 85 // contractions and expansions in different data objects 86 // normally differ even if they have the same data offsets. 87 if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) { 88 // fastpath 89 if (ce32 != baseCE32) { 90 tailored.add(start); 91 } 92 } else { 93 compare(start, ce32, baseCE32); 94 } 95 } while (++start <= end); 96 } 97 compare(int c, int ce32, int baseCE32)98 private void compare(int c, int ce32, int baseCE32) { 99 if (Collation.isPrefixCE32(ce32)) { 100 int dataIndex = Collation.indexFromCE32(ce32); 101 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 102 if (Collation.isPrefixCE32(baseCE32)) { 103 int baseIndex = Collation.indexFromCE32(baseCE32); 104 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 105 comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 106 } else { 107 addPrefixes(data, c, data.contexts, dataIndex + 2); 108 } 109 } else if (Collation.isPrefixCE32(baseCE32)) { 110 int baseIndex = Collation.indexFromCE32(baseCE32); 111 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 112 addPrefixes(baseData, c, baseData.contexts, baseIndex + 2); 113 } 114 115 if (Collation.isContractionCE32(ce32)) { 116 int dataIndex = Collation.indexFromCE32(ce32); 117 if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 118 ce32 = Collation.NO_CE32; 119 } else { 120 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 121 } 122 if (Collation.isContractionCE32(baseCE32)) { 123 int baseIndex = Collation.indexFromCE32(baseCE32); 124 if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 125 baseCE32 = Collation.NO_CE32; 126 } else { 127 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 128 } 129 compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 130 } else { 131 addContractions(c, data.contexts, dataIndex + 2); 132 } 133 } else if (Collation.isContractionCE32(baseCE32)) { 134 int baseIndex = Collation.indexFromCE32(baseCE32); 135 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 136 addContractions(c, baseData.contexts, baseIndex + 2); 137 } 138 139 int tag; 140 if (Collation.isSpecialCE32(ce32)) { 141 tag = Collation.tagFromCE32(ce32); 142 assert (tag != Collation.PREFIX_TAG); 143 assert (tag != Collation.CONTRACTION_TAG); 144 // Currently, the tailoring data builder does not write offset tags. 145 // They might be useful for saving space, 146 // but they would complicate the builder, 147 // and in tailorings we assume that performance of tailored characters is more important. 148 assert (tag != Collation.OFFSET_TAG); 149 } else { 150 tag = -1; 151 } 152 int baseTag; 153 if (Collation.isSpecialCE32(baseCE32)) { 154 baseTag = Collation.tagFromCE32(baseCE32); 155 assert (baseTag != Collation.PREFIX_TAG); 156 assert (baseTag != Collation.CONTRACTION_TAG); 157 } else { 158 baseTag = -1; 159 } 160 161 // Non-contextual mappings, expansions, etc. 162 if (baseTag == Collation.OFFSET_TAG) { 163 // We might be comparing a tailoring CE which is a copy of 164 // a base offset-tag CE, via the [optimize [set]] syntax 165 // or when a single-character mapping was copied for tailored contractions. 166 // Offset tags always result in long-primary CEs, 167 // with common secondary/tertiary weights. 168 if (!Collation.isLongPrimaryCE32(ce32)) { 169 add(c); 170 return; 171 } 172 long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)]; 173 long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); 174 if (Collation.primaryFromLongPrimaryCE32(ce32) != p) { 175 add(c); 176 return; 177 } 178 } 179 180 if (tag != baseTag) { 181 add(c); 182 return; 183 } 184 185 if (tag == Collation.EXPANSION32_TAG) { 186 int length = Collation.lengthFromCE32(ce32); 187 int baseLength = Collation.lengthFromCE32(baseCE32); 188 189 if (length != baseLength) { 190 add(c); 191 return; 192 } 193 194 int idx0 = Collation.indexFromCE32(ce32); 195 int idx1 = Collation.indexFromCE32(baseCE32); 196 197 for (int i = 0; i < length; ++i) { 198 if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) { 199 add(c); 200 break; 201 } 202 } 203 } else if (tag == Collation.EXPANSION_TAG) { 204 int length = Collation.lengthFromCE32(ce32); 205 int baseLength = Collation.lengthFromCE32(baseCE32); 206 207 if (length != baseLength) { 208 add(c); 209 return; 210 } 211 212 int idx0 = Collation.indexFromCE32(ce32); 213 int idx1 = Collation.indexFromCE32(baseCE32); 214 215 for (int i = 0; i < length; ++i) { 216 if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) { 217 add(c); 218 break; 219 } 220 } 221 } else if (tag == Collation.HANGUL_TAG) { 222 StringBuilder jamos = new StringBuilder(); 223 int length = Hangul.decompose(c, jamos); 224 if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1)) 225 || (length == 3 && tailored.contains(jamos.charAt(2)))) { 226 add(c); 227 } 228 } else if (ce32 != baseCE32) { 229 add(c); 230 } 231 } 232 comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx)233 private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 234 // Parallel iteration over prefixes of both tables. 235 CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 236 CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator(); 237 String tp = null; // Tailoring prefix. 238 String bp = null; // Base prefix. 239 // Use a string with a U+FFFF as the limit sentinel. 240 // U+FFFF is untailorable and will not occur in prefixes. 241 String none = "\uffff"; 242 Entry te = null, be = null; 243 for (;;) { 244 if (tp == null) { 245 if (prefixes.hasNext()) { 246 te = prefixes.next(); 247 tp = te.chars.toString(); 248 } else { 249 te = null; 250 tp = none; 251 } 252 } 253 if (bp == null) { 254 if (basePrefixes.hasNext()) { 255 be = basePrefixes.next(); 256 bp = be.chars.toString(); 257 } else { 258 be = null; 259 bp = none; 260 } 261 } 262 if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) { 263 break; 264 } 265 int cmp = tp.compareTo(bp); 266 if (cmp < 0) { 267 // tp occurs in the tailoring but not in the base. 268 assert (te != null); 269 addPrefix(data, tp, c, te.value); 270 te = null; 271 tp = null; 272 } else if (cmp > 0) { 273 // bp occurs in the base but not in the tailoring. 274 assert (be != null); 275 addPrefix(baseData, bp, c, be.value); 276 be = null; 277 bp = null; 278 } else { 279 setPrefix(tp); 280 assert (te != null && be != null); 281 compare(c, te.value, be.value); 282 resetPrefix(); 283 te = be = null; 284 tp = bp = null; 285 } 286 } 287 } 288 compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx)289 private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 290 // Parallel iteration over suffixes of both tables. 291 CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 292 CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator(); 293 String ts = null; // Tailoring suffix. 294 String bs = null; // Base suffix. 295 // Use a string with two U+FFFF as the limit sentinel. 296 // U+FFFF is untailorable and will not occur in contractions except maybe 297 // as a single suffix character for a root-collator boundary contraction. 298 String none = "\uffff\uffff"; 299 Entry te = null, be = null; 300 for (;;) { 301 if (ts == null) { 302 if (suffixes.hasNext()) { 303 te = suffixes.next(); 304 ts = te.chars.toString(); 305 } else { 306 te = null; 307 ts = none; 308 } 309 } 310 if (bs == null) { 311 if (baseSuffixes.hasNext()) { 312 be = baseSuffixes.next(); 313 bs = be.chars.toString(); 314 } else { 315 be = null; 316 bs = none; 317 } 318 } 319 if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) { 320 break; 321 } 322 int cmp = ts.compareTo(bs); 323 if (cmp < 0) { 324 // ts occurs in the tailoring but not in the base. 325 addSuffix(c, ts); 326 te = null; 327 ts = null; 328 } else if (cmp > 0) { 329 // bs occurs in the base but not in the tailoring. 330 addSuffix(c, bs); 331 be = null; 332 bs = null; 333 } else { 334 suffix = ts; 335 compare(c, te.value, be.value); 336 suffix = null; 337 te = be = null; 338 ts = bs = null; 339 } 340 } 341 } 342 addPrefixes(CollationData d, int c, CharSequence p, int pidx)343 private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) { 344 CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 345 while (prefixes.hasNext()) { 346 Entry e = prefixes.next(); 347 addPrefix(d, e.chars, c, e.value); 348 } 349 } 350 addPrefix(CollationData d, CharSequence pfx, int c, int ce32)351 private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) { 352 setPrefix(pfx); 353 ce32 = d.getFinalCE32(ce32); 354 if (Collation.isContractionCE32(ce32)) { 355 int idx = Collation.indexFromCE32(ce32); 356 addContractions(c, d.contexts, idx + 2); 357 } 358 tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c))); 359 resetPrefix(); 360 } 361 addContractions(int c, CharSequence p, int pidx)362 private void addContractions(int c, CharSequence p, int pidx) { 363 CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 364 while (suffixes.hasNext()) { 365 Entry e = suffixes.next(); 366 addSuffix(c, e.chars); 367 } 368 } 369 addSuffix(int c, CharSequence sfx)370 private void addSuffix(int c, CharSequence sfx) { 371 tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx)); 372 } 373 add(int c)374 private void add(int c) { 375 if (unreversedPrefix.length() == 0 && suffix == null) { 376 tailored.add(c); 377 } else { 378 StringBuilder s = new StringBuilder(unreversedPrefix); 379 s.appendCodePoint(c); 380 if (suffix != null) { 381 s.append(suffix); 382 } 383 tailored.add(s); 384 } 385 } 386 387 // Prefixes are reversed in the data structure. setPrefix(CharSequence pfx)388 private void setPrefix(CharSequence pfx) { 389 unreversedPrefix.setLength(0); 390 unreversedPrefix.append(pfx).reverse(); 391 } 392 resetPrefix()393 private void resetPrefix() { 394 unreversedPrefix.setLength(0); 395 } 396 } 397 398