1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2014-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.text.CharacterIterator;
12 import java.util.HashSet;
13 import java.util.Locale;
14 
15 import com.ibm.icu.impl.ICUResourceBundle.OpenType;
16 import com.ibm.icu.text.BreakIterator;
17 import com.ibm.icu.text.FilteredBreakIteratorBuilder;
18 import com.ibm.icu.text.UCharacterIterator;
19 import com.ibm.icu.util.BytesTrie;
20 import com.ibm.icu.util.CharsTrie;
21 import com.ibm.icu.util.CharsTrieBuilder;
22 import com.ibm.icu.util.StringTrieBuilder;
23 import com.ibm.icu.util.ULocale;
24 
25 /**
26  * @author tomzhang
27  */
28 public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
29 
30     private BreakIterator delegate;
31     private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ?
32     private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs.
33     private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M."
34 
35     /**
36      * @param adoptBreakIterator
37      *            break iterator to adopt
38      * @param forwardsPartialTrie
39      *            forward & partial char trie to adopt
40      * @param backwardsTrie
41      *            backward trie to adopt
42      */
SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, CharsTrie backwardsTrie)43     public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie,
44             CharsTrie backwardsTrie) {
45         this.delegate = adoptBreakIterator;
46         this.forwardsPartialTrie = forwardsPartialTrie;
47         this.backwardsTrie = backwardsTrie;
48     }
49 
50 
51     /**
52      * Reset the filter from the delegate.
53      */
resetState()54     private final void resetState() {
55         text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone());
56     }
57 
58     /**
59      * Is there an exception at this point?
60      *
61      * @param n the location of the possible break
62      * @return
63      */
breakExceptionAt(int n)64     private final boolean breakExceptionAt(int n) {
65         // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
66 
67         int bestPosn = -1;
68         int bestValue = -1;
69 
70         // loops while 'n' points to an exception
71         text.setIndex(n);
72         backwardsTrie.reset();
73         int uch;
74 
75 
76 
77         // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
78         if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
79             // TODO only do this the 1st time?
80         } else {
81             uch = text.nextCodePoint();
82         }
83 
84         BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
85 
86         while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
87                 ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
88             if (r.hasValue()) { // remember the best match so far
89                 bestPosn = text.getIndex();
90                 bestValue = backwardsTrie.getValue();
91             }
92         }
93 
94         if (r.matches()) { // exact match?
95             bestValue = backwardsTrie.getValue();
96             bestPosn = text.getIndex();
97         }
98 
99         if (bestPosn >= 0) {
100             if (bestValue == Builder.MATCH) { // exact match!
101                 return true; // Exception here.
102             } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
103                 // make sure there's a forward trie
104                 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
105                 // to see if it matches something going forward.
106                 forwardsPartialTrie.reset();
107 
108                 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
109                 text.setIndex(bestPosn); // hope that's close ..
110                 while ((uch = text.nextCodePoint()) != BreakIterator.DONE
111                         && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
112                 }
113                 if (rfwd.matches()) {
114                     // Exception here
115                     return true;
116                 } // else fall through
117             } // else fall through
118         } // else fall through
119         return false; // No exception here.
120     }
121 
122     /**
123      * Given that the delegate has already given its "initial" answer,
124      * find the NEXT actual (non-suppressed) break.
125      * @param n initial position from delegate
126      * @return new break position or BreakIterator.DONE
127      */
internalNext(int n)128     private final int internalNext(int n) {
129         if (n == BreakIterator.DONE || // at end or
130                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
131             return n;
132         }
133         resetState();
134 
135         final int textLen = text.getLength();
136 
137         while (n != BreakIterator.DONE && n != textLen) {
138             // outer loop runs once per underlying break (from fDelegate).
139             // loops while 'n' points to an exception.
140 
141             if (breakExceptionAt(n)) {
142                 // n points to a break exception
143                 n = delegate.next();
144             } else {
145                 // no exception at this spot
146                 return n;
147             }
148         }
149         return n; //hit underlying DONE or break at end of text
150     }
151 
152     /**
153      * Given that the delegate has already given its "initial" answer,
154      * find the PREV actual (non-suppressed) break.
155      * @param n initial position from delegate
156      * @return new break position or BreakIterator.DONE
157      */
internalPrev(int n)158     private final int internalPrev(int n) {
159         if (n == 0 || n == BreakIterator.DONE || // at end or
160                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
161             return n;
162         }
163         resetState();
164 
165         while (n != BreakIterator.DONE && n != 0) {
166             // outer loop runs once per underlying break (from fDelegate).
167             // loops while 'n' points to an exception.
168 
169             if (breakExceptionAt(n)) {
170                 // n points to a break exception
171                 n = delegate.previous();
172             } else {
173                 // no exception at this spot
174                 return n;
175             }
176         }
177         return n; //hit underlying DONE or break at end of text
178     }
179 
180     @Override
equals(Object obj)181     public boolean equals(Object obj) {
182         if (obj == null)
183             return false;
184         if (this == obj)
185             return true;
186         if (getClass() != obj.getClass())
187             return false;
188         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj;
189         return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie)
190                 && forwardsPartialTrie.equals(other.forwardsPartialTrie);
191     }
192 
193     @Override
hashCode()194     public int hashCode() {
195         return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode();
196     }
197 
198     @Override
clone()199     public Object clone() {
200         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone();
201         return other;
202     }
203 
204 
205     @Override
first()206     public int first() {
207         // Don't suppress a break opportunity at the beginning of text.
208         return delegate.first();
209     }
210 
211     @Override
preceding(int offset)212     public int preceding(int offset) {
213         return internalPrev(delegate.preceding(offset));
214     }
215 
216     @Override
previous()217     public int previous() {
218         return internalPrev(delegate.previous());
219     }
220 
221     @Override
current()222     public int current() {
223         return delegate.current();
224     }
225 
226     @Override
isBoundary(int offset)227     public boolean isBoundary(int offset) {
228         if(!delegate.isBoundary(offset)) {
229             return false; // No underlying break to suppress?
230         }
231 
232         // delegate thinks there's a break…
233         if(backwardsTrie == null) {
234             return true; // no data
235         }
236 
237         resetState();
238         return !breakExceptionAt(offset); // if there's an exception: no break.
239     }
240 
241     @Override
next()242     public int next() {
243         return internalNext(delegate.next());
244     }
245 
246     @Override
next(int n)247     public int next(int n) {
248         return internalNext(delegate.next(n));
249     }
250 
251     @Override
following(int offset)252     public int following(int offset) {
253         return internalNext(delegate.following(offset));
254     }
255 
256     @Override
last()257     public int last() {
258         // Don't suppress a break opportunity at the end of text.
259         return delegate.last();
260     }
261 
262     @Override
getText()263     public CharacterIterator getText() {
264         return delegate.getText();
265     }
266 
267     @Override
setText(CharacterIterator newText)268     public void setText(CharacterIterator newText) {
269         delegate.setText(newText);
270     }
271 
272     public static class Builder extends FilteredBreakIteratorBuilder {
273         /**
274          * filter set to store all exceptions
275          */
276         private HashSet<CharSequence> filterSet = new HashSet<CharSequence>();
277 
278         static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie
279         static final int MATCH = (1 << 1); // < exact match - skip this one.
280         static final int SuppressInReverse = (1 << 0);
281         static final int AddToForward = (1 << 1);
282 
Builder(Locale loc)283         public Builder(Locale loc) {
284             this(ULocale.forLocale(loc));
285         }
286         /**
287          * Create SimpleFilteredBreakIteratorBuilder using given locale
288          * @param loc the locale to get filtered iterators
289          */
Builder(ULocale loc)290         public Builder(ULocale loc) {
291             ICUResourceBundle rb = ICUResourceBundle.getBundleInstance(
292                     ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT);
293 
294             ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak");
295 
296             if (breaks != null) {
297                 for (int index = 0, size = breaks.getSize(); index < size; ++index) {
298                     ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
299                     String br = b.getString();
300                     filterSet.add(br);
301                 }
302             }
303         }
304 
305         /**
306          * Create SimpleFilteredBreakIteratorBuilder with no exception
307          */
Builder()308         public Builder() {
309         }
310 
311         @Override
suppressBreakAfter(CharSequence str)312         public boolean suppressBreakAfter(CharSequence str) {
313             return filterSet.add(str);
314         }
315 
316         @Override
unsuppressBreakAfter(CharSequence str)317         public boolean unsuppressBreakAfter(CharSequence str) {
318             return filterSet.remove(str);
319         }
320 
321         @Override
wrapIteratorWithFilter(BreakIterator adoptBreakIterator)322         public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) {
323             if( filterSet.isEmpty() ) {
324                 // Short circuit - nothing to except.
325                 return adoptBreakIterator;
326             }
327 
328             CharsTrieBuilder builder = new CharsTrieBuilder();
329             CharsTrieBuilder builder2 = new CharsTrieBuilder();
330 
331             int revCount = 0;
332             int fwdCount = 0;
333 
334             int subCount = filterSet.size();
335             CharSequence[] ustrs = new CharSequence[subCount];
336             int[] partials = new int[subCount];
337 
338             CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs.
339             CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M."
340 
341             int i = 0;
342             for (CharSequence s : filterSet) {
343                 ustrs[i] = s; // copy by value?
344                 partials[i] = 0; // default: no partial
345                 i++;
346             }
347 
348             for (i = 0; i < subCount; i++) {
349                 String thisStr = ustrs[i].toString(); // TODO: don't cast to String?
350                 int nn = thisStr.indexOf('.'); // TODO: non-'.' abbreviations
351                 if (nn > -1 && (nn + 1) != thisStr.length()) {
352                     // is partial.
353                     // is it unique?
354                     int sameAs = -1;
355                     for (int j = 0; j < subCount; j++) {
356                         if (j == i)
357                             continue;
358                         if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) {
359                             if (partials[j] == 0) { // hasn't been processed yet
360                                 partials[j] = SuppressInReverse | AddToForward;
361                             } else if ((partials[j] & SuppressInReverse) != 0) {
362                                 sameAs = j; // the other entry is already in the reverse table.
363                             }
364                         }
365                     }
366 
367                     if ((sameAs == -1) && (partials[i] == 0)) {
368                         StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1));
369                         // first one - add the prefix to the reverse table.
370                         prefix.reverse();
371                         builder.add(prefix, PARTIAL);
372                         revCount++;
373                         partials[i] = SuppressInReverse | AddToForward;
374                     }
375                 }
376             }
377 
378             for (i = 0; i < subCount; i++) {
379                 final String thisStr = ustrs[i].toString(); // TODO
380                 if (partials[i] == 0) {
381                     StringBuilder reversed = new StringBuilder(thisStr).reverse();
382                     builder.add(reversed, MATCH);
383                     revCount++;
384                 } else {
385                     // an optimization would be to only add the portion after the '.'
386                     // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the
387                     // forward,
388                     // instead of "Ph.D." since we already know the "Ph." part is a match.
389                     // would need the trie to be able to hold 0-length strings, though.
390                     builder2.add(thisStr, MATCH); // forward
391                     fwdCount++;
392                 }
393             }
394 
395             if (revCount > 0) {
396                 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST);
397             }
398 
399             if (fwdCount > 0) {
400                 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST);
401             }
402             return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie);
403         }
404     }
405 }
406