1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * FCDUTF16CollationIterator.java, ported from utf16collationiterator.h/.cpp 9 * 10 * C++ version created on: 2010oct27 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import com.ibm.icu.impl.Normalizer2Impl; 17 18 /** 19 * Incrementally checks the input text for FCD and normalizes where necessary. 20 */ 21 public final class FCDUTF16CollationIterator extends UTF16CollationIterator { 22 /** 23 * Partial constructor, see {@link CollationIterator#CollationIterator(CollationData)}. 24 */ FCDUTF16CollationIterator(CollationData d)25 public FCDUTF16CollationIterator(CollationData d) { 26 super(d); 27 nfcImpl = d.nfcImpl; 28 } 29 FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p)30 public FCDUTF16CollationIterator(CollationData data, boolean numeric, CharSequence s, int p) { 31 super(data, numeric, s, p); 32 rawSeq = s; 33 segmentStart = p; 34 rawLimit = s.length(); 35 nfcImpl = data.nfcImpl; 36 checkDir = 1; 37 } 38 39 @Override equals(Object other)40 public boolean equals(Object other) { 41 // Skip the UTF16CollationIterator and call its parent. 42 if (!(other instanceof CollationIterator) 43 || !((CollationIterator)this).equals(other) 44 || !(other instanceof FCDUTF16CollationIterator)) 45 { 46 return false; 47 } 48 FCDUTF16CollationIterator o = (FCDUTF16CollationIterator)other; 49 // Compare the iterator state but not the text: Assume that the caller does that. 50 if (checkDir != o.checkDir) { 51 return false; 52 } 53 if (checkDir == 0 && (seq == rawSeq) != (o.seq == o.rawSeq)) { 54 return false; 55 } 56 if (checkDir != 0 || seq == rawSeq) { 57 return (pos - rawStart) == (o.pos - /*o.*/ rawStart); 58 } 59 else { 60 return (segmentStart - rawStart) == (o.segmentStart - /*o.*/ rawStart) && 61 (pos - start) == (o.pos - o.start); 62 } 63 } 64 65 @Override hashCode()66 public int hashCode() { 67 assert false : "hashCode not designed"; 68 return 42; // any arbitrary constant will do 69 } 70 71 @Override resetToOffset(int newOffset)72 public void resetToOffset(int newOffset) { 73 reset(); 74 seq = rawSeq; 75 start = segmentStart = pos = rawStart + newOffset; 76 limit = rawLimit; 77 checkDir = 1; 78 } 79 80 @Override getOffset()81 public int getOffset() { 82 if(checkDir != 0 || seq == rawSeq) { 83 return pos - rawStart; 84 } else if(pos == start) { 85 return segmentStart - rawStart; 86 } else { 87 return segmentLimit - rawStart; 88 } 89 } 90 91 @Override setText(boolean numeric, CharSequence s, int p)92 public void setText(boolean numeric, CharSequence s, int p) { 93 super.setText(numeric, s, p); 94 rawSeq = s; 95 segmentStart = p; 96 rawLimit = limit = s.length(); 97 checkDir = 1; 98 } 99 100 @Override nextCodePoint()101 public int nextCodePoint() { 102 char c; 103 for(;;) { 104 if(checkDir > 0) { 105 if(pos == limit) { 106 return Collation.SENTINEL_CP; 107 } 108 c = seq.charAt(pos++); 109 if(CollationFCD.hasTccc(c)) { 110 if(CollationFCD.maybeTibetanCompositeVowel(c) || 111 (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) { 112 --pos; 113 nextSegment(); 114 c = seq.charAt(pos++); 115 } 116 } 117 break; 118 } else if(checkDir == 0 && pos != limit) { 119 c = seq.charAt(pos++); 120 break; 121 } else { 122 switchToForward(); 123 } 124 } 125 char trail; 126 if(Character.isHighSurrogate(c) && pos != limit && 127 Character.isLowSurrogate(trail = seq.charAt(pos))) { 128 ++pos; 129 return Character.toCodePoint(c, trail); 130 } else { 131 return c; 132 } 133 } 134 135 @Override previousCodePoint()136 public int previousCodePoint() { 137 char c; 138 for(;;) { 139 if(checkDir < 0) { 140 if(pos == start) { 141 return Collation.SENTINEL_CP; 142 } 143 c = seq.charAt(--pos); 144 if(CollationFCD.hasLccc(c)) { 145 if(CollationFCD.maybeTibetanCompositeVowel(c) || 146 (pos != start && CollationFCD.hasTccc(seq.charAt(pos - 1)))) { 147 ++pos; 148 previousSegment(); 149 c = seq.charAt(--pos); 150 } 151 } 152 break; 153 } else if(checkDir == 0 && pos != start) { 154 c = seq.charAt(--pos); 155 break; 156 } else { 157 switchToBackward(); 158 } 159 } 160 char lead; 161 if(Character.isLowSurrogate(c) && pos != start && 162 Character.isHighSurrogate(lead = seq.charAt(pos - 1))) { 163 --pos; 164 return Character.toCodePoint(lead, c); 165 } else { 166 return c; 167 } 168 } 169 170 @Override handleNextCE32()171 protected long handleNextCE32() { 172 char c; 173 for(;;) { 174 if(checkDir > 0) { 175 if(pos == limit) { 176 return NO_CP_AND_CE32; 177 } 178 c = seq.charAt(pos++); 179 if(CollationFCD.hasTccc(c)) { 180 if(CollationFCD.maybeTibetanCompositeVowel(c) || 181 (pos != limit && CollationFCD.hasLccc(seq.charAt(pos)))) { 182 --pos; 183 nextSegment(); 184 c = seq.charAt(pos++); 185 } 186 } 187 break; 188 } else if(checkDir == 0 && pos != limit) { 189 c = seq.charAt(pos++); 190 break; 191 } else { 192 switchToForward(); 193 } 194 } 195 return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead(c)); 196 } 197 198 /* boolean foundNULTerminator(); */ 199 200 @Override forwardNumCodePoints(int num)201 protected void forwardNumCodePoints(int num) { 202 // Specify the class to avoid a virtual-function indirection. 203 // In Java, we would declare this class final. 204 while(num > 0 && nextCodePoint() >= 0) { 205 --num; 206 } 207 } 208 209 @Override backwardNumCodePoints(int num)210 protected void backwardNumCodePoints(int num) { 211 // Specify the class to avoid a virtual-function indirection. 212 // In Java, we would declare this class final. 213 while(num > 0 && previousCodePoint() >= 0) { 214 --num; 215 } 216 } 217 218 /** 219 * Switches to forward checking if possible. 220 * To be called when checkDir < 0 || (checkDir == 0 && pos == limit). 221 * Returns with checkDir > 0 || (checkDir == 0 && pos != limit). 222 */ switchToForward()223 private void switchToForward() { 224 assert((checkDir < 0 && seq == rawSeq) || (checkDir == 0 && pos == limit)); 225 if(checkDir < 0) { 226 // Turn around from backward checking. 227 start = segmentStart = pos; 228 if(pos == segmentLimit) { 229 limit = rawLimit; 230 checkDir = 1; // Check forward. 231 } else { // pos < segmentLimit 232 checkDir = 0; // Stay in FCD segment. 233 } 234 } else { 235 // Reached the end of the FCD segment. 236 if(seq == rawSeq) { 237 // The input text segment is FCD, extend it forward. 238 } else { 239 // The input text segment needed to be normalized. 240 // Switch to checking forward from it. 241 seq = rawSeq; 242 pos = start = segmentStart = segmentLimit; 243 // Note: If this segment is at the end of the input text, 244 // then it might help to return false to indicate that, so that 245 // we do not have to re-check and normalize when we turn around and go backwards. 246 // However, that would complicate the call sites for an optimization of an unusual case. 247 } 248 limit = rawLimit; 249 checkDir = 1; 250 } 251 } 252 253 /** 254 * Extend the FCD text segment forward or normalize around pos. 255 * To be called when checkDir > 0 && pos != limit. 256 * Returns with checkDir == 0 and pos != limit. 257 */ 258 private void nextSegment() { 259 assert(checkDir > 0 && seq == rawSeq && pos != limit); 260 // The input text [segmentStart..pos[ passes the FCD check. 261 int p = pos; 262 int prevCC = 0; 263 for(;;) { 264 // Fetch the next character's fcd16 value. 265 int q = p; 266 int c = Character.codePointAt(seq, p); 267 p += Character.charCount(c); 268 int fcd16 = nfcImpl.getFCD16(c); 269 int leadCC = fcd16 >> 8; 270 if(leadCC == 0 && q != pos) { 271 // FCD boundary before the [q, p[ character. 272 limit = segmentLimit = q; 273 break; 274 } 275 if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 276 // Fails FCD check. Find the next FCD boundary and normalize. 277 do { 278 q = p; 279 if(p == rawLimit) { break; } 280 c = Character.codePointAt(seq, p); 281 p += Character.charCount(c); 282 } while(nfcImpl.getFCD16(c) > 0xff); 283 normalize(pos, q); 284 pos = start; 285 break; 286 } 287 prevCC = fcd16 & 0xff; 288 if(p == rawLimit || prevCC == 0) { 289 // FCD boundary after the last character. 290 limit = segmentLimit = p; 291 break; 292 } 293 } 294 assert(pos != limit); 295 checkDir = 0; 296 } 297 298 /** 299 * Switches to backward checking. 300 * To be called when checkDir > 0 || (checkDir == 0 && pos == start). 301 * Returns with checkDir < 0 || (checkDir == 0 && pos != start). 302 */ 303 private void switchToBackward() { 304 assert((checkDir > 0 && seq == rawSeq) || (checkDir == 0 && pos == start)); 305 if(checkDir > 0) { 306 // Turn around from forward checking. 307 limit = segmentLimit = pos; 308 if(pos == segmentStart) { 309 start = rawStart; 310 checkDir = -1; // Check backward. 311 } else { // pos > segmentStart 312 checkDir = 0; // Stay in FCD segment. 313 } 314 } else { 315 // Reached the start of the FCD segment. 316 if(seq == rawSeq) { 317 // The input text segment is FCD, extend it backward. 318 } else { 319 // The input text segment needed to be normalized. 320 // Switch to checking backward from it. 321 seq = rawSeq; 322 pos = limit = segmentLimit = segmentStart; 323 } 324 start = rawStart; 325 checkDir = -1; 326 } 327 } 328 329 /** 330 * Extend the FCD text segment backward or normalize around pos. 331 * To be called when checkDir < 0 && pos != start. 332 * Returns with checkDir == 0 and pos != start. 333 */ 334 private void previousSegment() { 335 assert(checkDir < 0 && seq == rawSeq && pos != start); 336 // The input text [pos..segmentLimit[ passes the FCD check. 337 int p = pos; 338 int nextCC = 0; 339 for(;;) { 340 // Fetch the previous character's fcd16 value. 341 int q = p; 342 int c = Character.codePointBefore(seq, p); 343 p -= Character.charCount(c); 344 int fcd16 = nfcImpl.getFCD16(c); 345 int trailCC = fcd16 & 0xff; 346 if(trailCC == 0 && q != pos) { 347 // FCD boundary after the [p, q[ character. 348 start = segmentStart = q; 349 break; 350 } 351 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 352 CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 353 // Fails FCD check. Find the previous FCD boundary and normalize. 354 do { 355 q = p; 356 if(fcd16 <= 0xff || p == rawStart) { break; } 357 c = Character.codePointBefore(seq, p); 358 p -= Character.charCount(c); 359 } while((fcd16 = nfcImpl.getFCD16(c)) != 0); 360 normalize(q, pos); 361 pos = limit; 362 break; 363 } 364 nextCC = fcd16 >> 8; 365 if(p == rawStart || nextCC == 0) { 366 // FCD boundary before the following character. 367 start = segmentStart = p; 368 break; 369 } 370 } 371 assert(pos != start); 372 checkDir = 0; 373 } 374 375 private void normalize(int from, int to) { 376 if(normalized == null) { 377 normalized = new StringBuilder(); 378 } 379 // NFD without argument checking. 380 nfcImpl.decompose(rawSeq, from, to, normalized, to - from); 381 // Switch collation processing into the FCD buffer 382 // with the result of normalizing [segmentStart, segmentLimit[. 383 segmentStart = from; 384 segmentLimit = to; 385 seq = normalized; 386 start = 0; 387 limit = start + normalized.length(); 388 } 389 390 // Text pointers: The input text is rawSeq[rawStart, rawLimit[. 391 // (In C++, these are const UChar * pointers. 392 // In Java, we use CharSequence rawSeq and the parent class' seq 393 // together with int indexes.) 394 // 395 // checkDir > 0: 396 // 397 // The input text rawSeq[segmentStart..pos[ passes the FCD check. 398 // Moving forward checks incrementally. 399 // segmentLimit is undefined. seq == rawSeq. limit == rawLimit. 400 // 401 // checkDir < 0: 402 // The input text rawSeq[pos..segmentLimit[ passes the FCD check. 403 // Moving backward checks incrementally. 404 // segmentStart is undefined. seq == rawSeq. start == rawStart. 405 // 406 // checkDir == 0: 407 // 408 // The input text rawSeq[segmentStart..segmentLimit[ is being processed. 409 // These pointers are at FCD boundaries. 410 // Either this text segment already passes the FCD check 411 // and seq==rawSeq && segmentStart==start<=pos<=limit==segmentLimit, 412 // or the current segment had to be normalized so that 413 // rawSeq[segmentStart..segmentLimit[ turned into the normalized string, 414 // corresponding to seq==normalized && 0==start<=pos<=limit==start+normalized.length(). 415 private CharSequence rawSeq; 416 private static final int rawStart = 0; 417 private int segmentStart; 418 private int segmentLimit; 419 private int rawLimit; 420 421 private final Normalizer2Impl nfcImpl; 422 private StringBuilder normalized; 423 // Direction of incremental FCD check. See comments before rawStart. 424 private int checkDir; 425 } 426