1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.text.CharacterIterator; 12 import java.util.HashSet; 13 import java.util.Locale; 14 15 import com.ibm.icu.impl.ICUResourceBundle.OpenType; 16 import com.ibm.icu.text.BreakIterator; 17 import com.ibm.icu.text.FilteredBreakIteratorBuilder; 18 import com.ibm.icu.text.UCharacterIterator; 19 import com.ibm.icu.util.BytesTrie; 20 import com.ibm.icu.util.CharsTrie; 21 import com.ibm.icu.util.CharsTrieBuilder; 22 import com.ibm.icu.util.StringTrieBuilder; 23 import com.ibm.icu.util.ULocale; 24 25 /** 26 * @author tomzhang 27 */ 28 public class SimpleFilteredSentenceBreakIterator extends BreakIterator { 29 30 private BreakIterator delegate; 31 private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ? 32 private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs. 33 private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M." 34 35 /** 36 * @param adoptBreakIterator 37 * break iterator to adopt 38 * @param forwardsPartialTrie 39 * forward & partial char trie to adopt 40 * @param backwardsTrie 41 * backward trie to adopt 42 */ SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, CharsTrie backwardsTrie)43 public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, 44 CharsTrie backwardsTrie) { 45 this.delegate = adoptBreakIterator; 46 this.forwardsPartialTrie = forwardsPartialTrie; 47 this.backwardsTrie = backwardsTrie; 48 } 49 50 51 /** 52 * Reset the filter from the delegate. 53 */ resetState()54 private final void resetState() { 55 text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone()); 56 } 57 58 /** 59 * Is there an exception at this point? 60 * 61 * @param n the location of the possible break 62 * @return 63 */ breakExceptionAt(int n)64 private final boolean breakExceptionAt(int n) { 65 // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt() 66 67 int bestPosn = -1; 68 int bestValue = -1; 69 70 // loops while 'n' points to an exception 71 text.setIndex(n); 72 backwardsTrie.reset(); 73 int uch; 74 75 76 77 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 78 if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here?? 79 // TODO only do this the 1st time? 80 } else { 81 uch = text.nextCodePoint(); 82 } 83 84 BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE; 85 86 while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and.. 87 ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie 88 if (r.hasValue()) { // remember the best match so far 89 bestPosn = text.getIndex(); 90 bestValue = backwardsTrie.getValue(); 91 } 92 } 93 94 if (r.matches()) { // exact match? 95 bestValue = backwardsTrie.getValue(); 96 bestPosn = text.getIndex(); 97 } 98 99 if (bestPosn >= 0) { 100 if (bestValue == Builder.MATCH) { // exact match! 101 return true; // Exception here. 102 } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) { 103 // make sure there's a forward trie 104 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie 105 // to see if it matches something going forward. 106 forwardsPartialTrie.reset(); 107 108 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE; 109 text.setIndex(bestPosn); // hope that's close .. 110 while ((uch = text.nextCodePoint()) != BreakIterator.DONE 111 && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) { 112 } 113 if (rfwd.matches()) { 114 // Exception here 115 return true; 116 } // else fall through 117 } // else fall through 118 } // else fall through 119 return false; // No exception here. 120 } 121 122 /** 123 * Given that the delegate has already given its "initial" answer, 124 * find the NEXT actual (non-suppressed) break. 125 * @param n initial position from delegate 126 * @return new break position or BreakIterator.DONE 127 */ internalNext(int n)128 private final int internalNext(int n) { 129 if (n == BreakIterator.DONE || // at end or 130 backwardsTrie == null) { // .. no backwards table loaded == no exceptions 131 return n; 132 } 133 resetState(); 134 135 final int textLen = text.getLength(); 136 137 while (n != BreakIterator.DONE && n != textLen) { 138 // outer loop runs once per underlying break (from fDelegate). 139 // loops while 'n' points to an exception. 140 141 if (breakExceptionAt(n)) { 142 // n points to a break exception 143 n = delegate.next(); 144 } else { 145 // no exception at this spot 146 return n; 147 } 148 } 149 return n; //hit underlying DONE or break at end of text 150 } 151 152 /** 153 * Given that the delegate has already given its "initial" answer, 154 * find the PREV actual (non-suppressed) break. 155 * @param n initial position from delegate 156 * @return new break position or BreakIterator.DONE 157 */ internalPrev(int n)158 private final int internalPrev(int n) { 159 if (n == 0 || n == BreakIterator.DONE || // at end or 160 backwardsTrie == null) { // .. no backwards table loaded == no exceptions 161 return n; 162 } 163 resetState(); 164 165 while (n != BreakIterator.DONE && n != 0) { 166 // outer loop runs once per underlying break (from fDelegate). 167 // loops while 'n' points to an exception. 168 169 if (breakExceptionAt(n)) { 170 // n points to a break exception 171 n = delegate.previous(); 172 } else { 173 // no exception at this spot 174 return n; 175 } 176 } 177 return n; //hit underlying DONE or break at end of text 178 } 179 180 @Override equals(Object obj)181 public boolean equals(Object obj) { 182 if (obj == null) 183 return false; 184 if (this == obj) 185 return true; 186 if (getClass() != obj.getClass()) 187 return false; 188 SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj; 189 return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie) 190 && forwardsPartialTrie.equals(other.forwardsPartialTrie); 191 } 192 193 @Override hashCode()194 public int hashCode() { 195 return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode(); 196 } 197 198 @Override clone()199 public Object clone() { 200 SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone(); 201 return other; 202 } 203 204 205 @Override first()206 public int first() { 207 // Don't suppress a break opportunity at the beginning of text. 208 return delegate.first(); 209 } 210 211 @Override preceding(int offset)212 public int preceding(int offset) { 213 return internalPrev(delegate.preceding(offset)); 214 } 215 216 @Override previous()217 public int previous() { 218 return internalPrev(delegate.previous()); 219 } 220 221 @Override current()222 public int current() { 223 return delegate.current(); 224 } 225 226 @Override isBoundary(int offset)227 public boolean isBoundary(int offset) { 228 if(!delegate.isBoundary(offset)) { 229 return false; // No underlying break to suppress? 230 } 231 232 // delegate thinks there's a break… 233 if(backwardsTrie == null) { 234 return true; // no data 235 } 236 237 resetState(); 238 return !breakExceptionAt(offset); // if there's an exception: no break. 239 } 240 241 @Override next()242 public int next() { 243 return internalNext(delegate.next()); 244 } 245 246 @Override next(int n)247 public int next(int n) { 248 return internalNext(delegate.next(n)); 249 } 250 251 @Override following(int offset)252 public int following(int offset) { 253 return internalNext(delegate.following(offset)); 254 } 255 256 @Override last()257 public int last() { 258 // Don't suppress a break opportunity at the end of text. 259 return delegate.last(); 260 } 261 262 @Override getText()263 public CharacterIterator getText() { 264 return delegate.getText(); 265 } 266 267 @Override setText(CharacterIterator newText)268 public void setText(CharacterIterator newText) { 269 delegate.setText(newText); 270 } 271 272 public static class Builder extends FilteredBreakIteratorBuilder { 273 /** 274 * filter set to store all exceptions 275 */ 276 private HashSet<CharSequence> filterSet = new HashSet<CharSequence>(); 277 278 static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie 279 static final int MATCH = (1 << 1); // < exact match - skip this one. 280 static final int SuppressInReverse = (1 << 0); 281 static final int AddToForward = (1 << 1); 282 Builder(Locale loc)283 public Builder(Locale loc) { 284 this(ULocale.forLocale(loc)); 285 } 286 /** 287 * Create SimpleFilteredBreakIteratorBuilder using given locale 288 * @param loc the locale to get filtered iterators 289 */ Builder(ULocale loc)290 public Builder(ULocale loc) { 291 ICUResourceBundle rb = ICUResourceBundle.getBundleInstance( 292 ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT); 293 294 ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak"); 295 296 if (breaks != null) { 297 for (int index = 0, size = breaks.getSize(); index < size; ++index) { 298 ICUResourceBundle b = (ICUResourceBundle) breaks.get(index); 299 String br = b.getString(); 300 filterSet.add(br); 301 } 302 } 303 } 304 305 /** 306 * Create SimpleFilteredBreakIteratorBuilder with no exception 307 */ Builder()308 public Builder() { 309 } 310 311 @Override suppressBreakAfter(CharSequence str)312 public boolean suppressBreakAfter(CharSequence str) { 313 return filterSet.add(str); 314 } 315 316 @Override unsuppressBreakAfter(CharSequence str)317 public boolean unsuppressBreakAfter(CharSequence str) { 318 return filterSet.remove(str); 319 } 320 321 @Override wrapIteratorWithFilter(BreakIterator adoptBreakIterator)322 public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) { 323 if( filterSet.isEmpty() ) { 324 // Short circuit - nothing to except. 325 return adoptBreakIterator; 326 } 327 328 CharsTrieBuilder builder = new CharsTrieBuilder(); 329 CharsTrieBuilder builder2 = new CharsTrieBuilder(); 330 331 int revCount = 0; 332 int fwdCount = 0; 333 334 int subCount = filterSet.size(); 335 CharSequence[] ustrs = new CharSequence[subCount]; 336 int[] partials = new int[subCount]; 337 338 CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs. 339 CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M." 340 341 int i = 0; 342 for (CharSequence s : filterSet) { 343 ustrs[i] = s; // copy by value? 344 partials[i] = 0; // default: no partial 345 i++; 346 } 347 348 for (i = 0; i < subCount; i++) { 349 String thisStr = ustrs[i].toString(); // TODO: don't cast to String? 350 int nn = thisStr.indexOf('.'); // TODO: non-'.' abbreviations 351 if (nn > -1 && (nn + 1) != thisStr.length()) { 352 // is partial. 353 // is it unique? 354 int sameAs = -1; 355 for (int j = 0; j < subCount; j++) { 356 if (j == i) 357 continue; 358 if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) { 359 if (partials[j] == 0) { // hasn't been processed yet 360 partials[j] = SuppressInReverse | AddToForward; 361 } else if ((partials[j] & SuppressInReverse) != 0) { 362 sameAs = j; // the other entry is already in the reverse table. 363 } 364 } 365 } 366 367 if ((sameAs == -1) && (partials[i] == 0)) { 368 StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1)); 369 // first one - add the prefix to the reverse table. 370 prefix.reverse(); 371 builder.add(prefix, PARTIAL); 372 revCount++; 373 partials[i] = SuppressInReverse | AddToForward; 374 } 375 } 376 } 377 378 for (i = 0; i < subCount; i++) { 379 final String thisStr = ustrs[i].toString(); // TODO 380 if (partials[i] == 0) { 381 StringBuilder reversed = new StringBuilder(thisStr).reverse(); 382 builder.add(reversed, MATCH); 383 revCount++; 384 } else { 385 // an optimization would be to only add the portion after the '.' 386 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the 387 // forward, 388 // instead of "Ph.D." since we already know the "Ph." part is a match. 389 // would need the trie to be able to hold 0-length strings, though. 390 builder2.add(thisStr, MATCH); // forward 391 fwdCount++; 392 } 393 } 394 395 if (revCount > 0) { 396 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST); 397 } 398 399 if (fwdCount > 0) { 400 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST); 401 } 402 return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie); 403 } 404 } 405 } 406