1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp 9 * 10 * C++ version created on: 2012sep23 (from utf16collationiterator.h) 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import com.ibm.icu.impl.Normalizer2Impl; 17 import com.ibm.icu.text.UCharacterIterator; 18 19 /** 20 * Incrementally checks the input text for FCD and normalizes where necessary. 21 */ 22 public final class FCDIterCollationIterator extends IterCollationIterator { FCDIterCollationIterator(CollationData data, boolean numeric, UCharacterIterator ui, int startIndex)23 public FCDIterCollationIterator(CollationData data, boolean numeric, 24 UCharacterIterator ui, int startIndex) { 25 super(data, numeric, ui); 26 state = State.ITER_CHECK_FWD; 27 start = startIndex; 28 nfcImpl = data.nfcImpl; 29 } 30 31 @Override resetToOffset(int newOffset)32 public void resetToOffset(int newOffset) { 33 super.resetToOffset(newOffset); 34 start = newOffset; 35 state = State.ITER_CHECK_FWD; 36 } 37 38 @Override getOffset()39 public int getOffset() { 40 if(state.compareTo(State.ITER_CHECK_BWD) <= 0) { 41 return iter.getIndex(); 42 } else if(state == State.ITER_IN_FCD_SEGMENT) { 43 return pos; 44 } else if(pos == 0) { 45 return start; 46 } else { 47 return limit; 48 } 49 } 50 51 @Override nextCodePoint()52 public int nextCodePoint() { 53 int c; 54 for(;;) { 55 if(state == State.ITER_CHECK_FWD) { 56 c = iter.next(); 57 if(c < 0) { 58 return c; 59 } 60 if(CollationFCD.hasTccc(c)) { 61 if(CollationFCD.maybeTibetanCompositeVowel(c) || 62 CollationFCD.hasLccc(iter.current())) { 63 iter.previous(); 64 if(!nextSegment()) { 65 return Collation.SENTINEL_CP; 66 } 67 continue; 68 } 69 } 70 if(isLeadSurrogate(c)) { 71 int trail = iter.next(); 72 if(isTrailSurrogate(trail)) { 73 return Character.toCodePoint((char)c, (char)trail); 74 } else if(trail >= 0) { 75 iter.previous(); 76 } 77 } 78 return c; 79 } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) { 80 c = iter.nextCodePoint(); 81 pos += Character.charCount(c); 82 assert(c >= 0); 83 return c; 84 } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && 85 pos != normalized.length()) { 86 c = normalized.codePointAt(pos); 87 pos += Character.charCount(c); 88 return c; 89 } else { 90 switchToForward(); 91 } 92 } 93 } 94 95 @Override previousCodePoint()96 public int previousCodePoint() { 97 int c; 98 for(;;) { 99 if(state == State.ITER_CHECK_BWD) { 100 c = iter.previous(); 101 if(c < 0) { 102 start = pos = 0; 103 state = State.ITER_IN_FCD_SEGMENT; 104 return Collation.SENTINEL_CP; 105 } 106 if(CollationFCD.hasLccc(c)) { 107 int prev = Collation.SENTINEL_CP; 108 if(CollationFCD.maybeTibetanCompositeVowel(c) || 109 CollationFCD.hasTccc(prev = iter.previous())) { 110 iter.next(); 111 if(prev >= 0) { 112 iter.next(); 113 } 114 if(!previousSegment()) { 115 return Collation.SENTINEL_CP; 116 } 117 continue; 118 } 119 // hasLccc(trail)=true for all trail surrogates 120 if(isTrailSurrogate(c)) { 121 if(prev < 0) { 122 prev = iter.previous(); 123 } 124 if(isLeadSurrogate(prev)) { 125 return Character.toCodePoint((char)prev, (char)c); 126 } 127 } 128 if(prev >= 0) { 129 iter.next(); 130 } 131 } 132 return c; 133 } else if(state == State.ITER_IN_FCD_SEGMENT && pos != start) { 134 c = iter.previousCodePoint(); 135 pos -= Character.charCount(c); 136 assert(c >= 0); 137 return c; 138 } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) { 139 c = normalized.codePointBefore(pos); 140 pos -= Character.charCount(c); 141 return c; 142 } else { 143 switchToBackward(); 144 } 145 } 146 } 147 148 @Override handleNextCE32()149 protected long handleNextCE32() { 150 int c; 151 for(;;) { 152 if(state == State.ITER_CHECK_FWD) { 153 c = iter.next(); 154 if(c < 0) { 155 return NO_CP_AND_CE32; 156 } 157 if(CollationFCD.hasTccc(c)) { 158 if(CollationFCD.maybeTibetanCompositeVowel(c) || 159 CollationFCD.hasLccc(iter.current())) { 160 iter.previous(); 161 if(!nextSegment()) { 162 c = Collation.SENTINEL_CP; 163 return Collation.FALLBACK_CE32; 164 } 165 continue; 166 } 167 } 168 break; 169 } else if(state == State.ITER_IN_FCD_SEGMENT && pos != limit) { 170 c = iter.next(); 171 ++pos; 172 assert(c >= 0); 173 break; 174 } else if(state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && 175 pos != normalized.length()) { 176 c = normalized.charAt(pos++); 177 break; 178 } else { 179 switchToForward(); 180 } 181 } 182 return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char)c)); 183 } 184 185 @Override handleGetTrailSurrogate()186 protected char handleGetTrailSurrogate() { 187 if(state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) { 188 int trail = iter.next(); 189 if(isTrailSurrogate(trail)) { 190 if(state == State.ITER_IN_FCD_SEGMENT) { ++pos; } 191 } else if(trail >= 0) { 192 iter.previous(); 193 } 194 return (char)trail; 195 } else { 196 assert(pos < normalized.length()); 197 char trail; 198 if(Character.isLowSurrogate(trail = normalized.charAt(pos))) { ++pos; } 199 return trail; 200 } 201 } 202 203 @Override 204 protected void forwardNumCodePoints(int num) { 205 // Specify the class to avoid a virtual-function indirection. 206 // In Java, we would declare this class final. 207 while(num > 0 && nextCodePoint() >= 0) { 208 --num; 209 } 210 } 211 212 @Override backwardNumCodePoints(int num)213 protected void backwardNumCodePoints(int num) { 214 // Specify the class to avoid a virtual-function indirection. 215 // In Java, we would declare this class final. 216 while(num > 0 && previousCodePoint() >= 0) { 217 --num; 218 } 219 } 220 221 /** 222 * Switches to forward checking if possible. 223 */ switchToForward()224 private void switchToForward() { 225 assert(state == State.ITER_CHECK_BWD || 226 (state == State.ITER_IN_FCD_SEGMENT && pos == limit) || 227 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == normalized.length())); 228 if(state == State.ITER_CHECK_BWD) { 229 // Turn around from backward checking. 230 start = pos = iter.getIndex(); 231 if(pos == limit) { 232 state = State.ITER_CHECK_FWD; // Check forward. 233 } else { // pos < limit 234 state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment. 235 } 236 } else { 237 // Reached the end of the FCD segment. 238 if(state == State.ITER_IN_FCD_SEGMENT) { 239 // The input text segment is FCD, extend it forward. 240 } else { 241 // The input text segment needed to be normalized. 242 // Switch to checking forward from it. 243 if(state == State.IN_NORM_ITER_AT_START) { 244 iter.moveIndex(limit - start); 245 } 246 start = limit; 247 } 248 state = State.ITER_CHECK_FWD; 249 } 250 } 251 252 /** 253 * Extends the FCD text segment forward or normalizes around pos. 254 * @return true if success 255 */ nextSegment()256 private boolean nextSegment() { 257 assert(state == State.ITER_CHECK_FWD); 258 // The input text [start..(iter index)[ passes the FCD check. 259 pos = iter.getIndex(); 260 // Collect the characters being checked, in case they need to be normalized. 261 if(s == null) { 262 s = new StringBuilder(); 263 } else { 264 s.setLength(0); 265 } 266 int prevCC = 0; 267 for(;;) { 268 // Fetch the next character and its fcd16 value. 269 int c = iter.nextCodePoint(); 270 if(c < 0) { break; } 271 int fcd16 = nfcImpl.getFCD16(c); 272 int leadCC = fcd16 >> 8; 273 if(leadCC == 0 && s.length() != 0) { 274 // FCD boundary before this character. 275 iter.previousCodePoint(); 276 break; 277 } 278 s.appendCodePoint(c); 279 if(leadCC != 0 && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 280 // Fails FCD check. Find the next FCD boundary and normalize. 281 for(;;) { 282 c = iter.nextCodePoint(); 283 if(c < 0) { break; } 284 if(nfcImpl.getFCD16(c) <= 0xff) { 285 iter.previousCodePoint(); 286 break; 287 } 288 s.appendCodePoint(c); 289 } 290 normalize(s); 291 start = pos; 292 limit = pos + s.length(); 293 state = State.IN_NORM_ITER_AT_LIMIT; 294 pos = 0; 295 return true; 296 } 297 prevCC = fcd16 & 0xff; 298 if(prevCC == 0) { 299 // FCD boundary after the last character. 300 break; 301 } 302 } 303 limit = pos + s.length(); 304 assert(pos != limit); 305 iter.moveIndex(-s.length()); 306 state = State.ITER_IN_FCD_SEGMENT; 307 return true; 308 } 309 310 /** 311 * Switches to backward checking. 312 */ switchToBackward()313 private void switchToBackward() { 314 assert(state == State.ITER_CHECK_FWD || 315 (state == State.ITER_IN_FCD_SEGMENT && pos == start) || 316 (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0)); 317 if(state == State.ITER_CHECK_FWD) { 318 // Turn around from forward checking. 319 limit = pos = iter.getIndex(); 320 if(pos == start) { 321 state = State.ITER_CHECK_BWD; // Check backward. 322 } else { // pos > start 323 state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment. 324 } 325 } else { 326 // Reached the start of the FCD segment. 327 if(state == State.ITER_IN_FCD_SEGMENT) { 328 // The input text segment is FCD, extend it backward. 329 } else { 330 // The input text segment needed to be normalized. 331 // Switch to checking backward from it. 332 if(state == State.IN_NORM_ITER_AT_LIMIT) { 333 iter.moveIndex(start - limit); 334 } 335 limit = start; 336 } 337 state = State.ITER_CHECK_BWD; 338 } 339 } 340 341 /** 342 * Extends the FCD text segment backward or normalizes around pos. 343 * @return true if success 344 */ previousSegment()345 private boolean previousSegment() { 346 assert(state == State.ITER_CHECK_BWD); 347 // The input text [(iter index)..limit[ passes the FCD check. 348 pos = iter.getIndex(); 349 // Collect the characters being checked, in case they need to be normalized. 350 if(s == null) { 351 s = new StringBuilder(); 352 } else { 353 s.setLength(0); 354 } 355 int nextCC = 0; 356 for(;;) { 357 // Fetch the previous character and its fcd16 value. 358 int c = iter.previousCodePoint(); 359 if(c < 0) { break; } 360 int fcd16 = nfcImpl.getFCD16(c); 361 int trailCC = fcd16 & 0xff; 362 if(trailCC == 0 && s.length() != 0) { 363 // FCD boundary after this character. 364 iter.nextCodePoint(); 365 break; 366 } 367 s.appendCodePoint(c); 368 if(trailCC != 0 && ((nextCC != 0 && trailCC > nextCC) || 369 CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) { 370 // Fails FCD check. Find the previous FCD boundary and normalize. 371 while(fcd16 > 0xff) { 372 c = iter.previousCodePoint(); 373 if(c < 0) { break; } 374 fcd16 = nfcImpl.getFCD16(c); 375 if(fcd16 == 0) { 376 iter.nextCodePoint(); 377 break; 378 } 379 s.appendCodePoint(c); 380 } 381 s.reverse(); 382 normalize(s); 383 limit = pos; 384 start = pos - s.length(); 385 state = State.IN_NORM_ITER_AT_START; 386 pos = normalized.length(); 387 return true; 388 } 389 nextCC = fcd16 >> 8; 390 if(nextCC == 0) { 391 // FCD boundary before the following character. 392 break; 393 } 394 } 395 start = pos - s.length(); 396 assert(pos != start); 397 iter.moveIndex(s.length()); 398 state = State.ITER_IN_FCD_SEGMENT; 399 return true; 400 } 401 normalize(CharSequence s)402 private void normalize(CharSequence s) { 403 if(normalized == null) { 404 normalized = new StringBuilder(); 405 } 406 // NFD without argument checking. 407 nfcImpl.decompose(s, normalized); 408 } 409 410 private enum State { 411 /** 412 * The input text [start..(iter index)[ passes the FCD check. 413 * Moving forward checks incrementally. 414 * pos & limit are undefined. 415 */ 416 ITER_CHECK_FWD, 417 /** 418 * The input text [(iter index)..limit[ passes the FCD check. 419 * Moving backward checks incrementally. 420 * start & pos are undefined. 421 */ 422 ITER_CHECK_BWD, 423 /** 424 * The input text [start..limit[ passes the FCD check. 425 * pos tracks the current text index. 426 */ 427 ITER_IN_FCD_SEGMENT, 428 /** 429 * The input text [start..limit[ failed the FCD check and was normalized. 430 * pos tracks the current index in the normalized string. 431 * The text iterator is at the limit index. 432 */ 433 IN_NORM_ITER_AT_LIMIT, 434 /** 435 * The input text [start..limit[ failed the FCD check and was normalized. 436 * pos tracks the current index in the normalized string. 437 * The text iterator is at the start index. 438 */ 439 IN_NORM_ITER_AT_START 440 } 441 442 private State state; 443 444 private int start; 445 private int pos; 446 private int limit; 447 448 private final Normalizer2Impl nfcImpl; 449 private StringBuilder s; 450 private StringBuilder normalized; 451 } 452