1 /*
2 *******************************************************************************
3 * Copyright (C) 2014-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 *******************************************************************************
6 */
7 
8 #include "unicode/utypes.h"
9 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
10 
11 #include "cmemory.h"
12 
13 #include "unicode/filteredbrk.h"
14 #include "unicode/ucharstriebuilder.h"
15 #include "unicode/ures.h"
16 
17 #include "uresimp.h" // ures_getByKeyWithFallback
18 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
19 #include "uvector.h"
20 #include "cmemory.h"
21 
22 U_NAMESPACE_BEGIN
23 
// Set FB_DEBUG to a non-zero value to compile in the FB_TRACE diagnostics below.
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only helper behind FB_TRACE: writes one diagnostic line to stderr.
 * @param m message text
 * @param s string to show, or NULL (shown as "NULL")
 * @param b flag, shown as 'T' or 'F'
 * @param d integer detail value
 * @param f source file of the call site
 * @param l source line of the call site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
  // Zero-filled, and extract() is never offered the last byte, so the text
  // handed to fprintf() is always NUL-terminated even if the string is too
  // long to fit.
  char buf[2048] = { 0 };
  if(s) {
    s->extract(0,s->length(),buf,(uint32_t)(sizeof(buf)-1));
  } else {
    strcpy(buf,"NULL");
  }
  fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
          f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
}

// Trace with the call site's file and line filled in automatically.
#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
// Tracing disabled: FB_TRACE expands to nothing.
#define FB_TRACE(m,s,b,d)
#endif
45 
46 /**
47  * Used with sortedInsert()
48  */
compareUnicodeString(UElement t1,UElement t2)49 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
50     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
51     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
52     return a.compare(b);
53 }
54 
55 /**
56  * A UVector which implements a set of strings.
57  */
58 class U_COMMON_API UStringSet : public UVector {
59  public:
UStringSet(UErrorCode & status)60   UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
61                                            uhash_compareUnicodeString,
62                                            1,
63                                            status) {}
64   virtual ~UStringSet();
65   /**
66    * Is this UnicodeSet contained?
67    */
contains(const UnicodeString & s)68   inline UBool contains(const UnicodeString& s) {
69     return contains((void*) &s);
70   }
71   using UVector::contains;
72   /**
73    * Return the ith UnicodeString alias
74    */
getStringAt(int32_t i) const75   inline const UnicodeString* getStringAt(int32_t i) const {
76     return (const UnicodeString*)elementAt(i);
77   }
78   /**
79    * Adopt the UnicodeString if not already contained.
80    * Caller no longer owns the pointer in any case.
81    * @return true if adopted successfully, false otherwise (error, or else duplicate)
82    */
adopt(UnicodeString * str,UErrorCode & status)83   inline UBool adopt(UnicodeString *str, UErrorCode &status) {
84     if(U_FAILURE(status) || contains(*str)) {
85       delete str;
86       return false;
87     } else {
88       sortedInsert(str, compareUnicodeString, status);
89       if(U_FAILURE(status)) {
90         delete str;
91         return false;
92       }
93       return true;
94     }
95   }
96   /**
97    * Add by value.
98    * @return true if successfully adopted.
99    */
add(const UnicodeString & str,UErrorCode & status)100   inline UBool add(const UnicodeString& str, UErrorCode &status) {
101     if(U_FAILURE(status)) return false;
102     UnicodeString *t = new UnicodeString(str);
103     if(t==NULL) {
104       status = U_MEMORY_ALLOCATION_ERROR; return false;
105     }
106     return adopt(t, status);
107   }
108   /**
109    * Remove this string.
110    * @return true if successfully removed, false otherwise (error, or else it wasn't there)
111    */
remove(const UnicodeString & s,UErrorCode & status)112   inline UBool remove(const UnicodeString &s, UErrorCode &status) {
113     if(U_FAILURE(status)) return false;
114     return removeElement((void*) &s);
115   }
116 };
117 
118 /**
119  * Virtual, won't be inlined
120  */
~UStringSet()121 UStringSet::~UStringSet() {}
122 
123 /* ----------------------------------------------------------- */
124 
125 
126 /* Filtered Break constants */
127 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
128 static const int32_t kMATCH   = (1<<1); //< exact match - skip this one.
129 static const int32_t kSuppressInReverse = (1<<0);
130 static const int32_t kAddToForward = (1<<1);
131 static const UChar   kFULLSTOP = 0x002E; // '.'
132 
133 /**
134  * Shared data for SimpleFilteredSentenceBreakIterator
135  */
136 class SimpleFilteredSentenceBreakData : public UMemory {
137 public:
SimpleFilteredSentenceBreakData(UCharsTrie * forwards,UCharsTrie * backwards)138   SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
139       : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
incr()140   SimpleFilteredSentenceBreakData *incr() { refcount++;  return this; }
decr()141   SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
142   virtual ~SimpleFilteredSentenceBreakData();
143 
144   LocalPointer<UCharsTrie>    fForwardsPartialTrie; //  Has ".a" for "a.M."
145   LocalPointer<UCharsTrie>    fBackwardsTrie; //  i.e. ".srM" for Mrs.
146   int32_t                     refcount;
147 };
148 
~SimpleFilteredSentenceBreakData()149 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
150 
151 /**
152  * Concrete implementation
153  */
154 class SimpleFilteredSentenceBreakIterator : public BreakIterator {
155 public:
156   SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
157   SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
158   virtual ~SimpleFilteredSentenceBreakIterator();
159 private:
160   SimpleFilteredSentenceBreakData *fData;
161   LocalPointer<BreakIterator> fDelegate;
162   LocalUTextPointer           fText;
163 
164   /* -- subclass interface -- */
165 public:
166   /* -- cloning and other subclass stuff -- */
createBufferClone(void *,int32_t &,UErrorCode & status)167   virtual BreakIterator *  createBufferClone(void * /*stackBuffer*/,
168                                              int32_t &/*BufferSize*/,
169                                              UErrorCode &status) {
170     // for now - always deep clone
171     status = U_SAFECLONE_ALLOCATED_WARNING;
172     return clone();
173   }
clone(void) const174   virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
getDynamicClassID(void) const175   virtual UClassID getDynamicClassID(void) const { return NULL; }
operator ==(const BreakIterator & o) const176   virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
177 
178   /* -- text modifying -- */
setText(UText * text,UErrorCode & status)179   virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
refreshInputText(UText * input,UErrorCode & status)180   virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
adoptText(CharacterIterator * it)181   virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
setText(const UnicodeString & text)182   virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
183 
184   /* -- other functions that are just delegated -- */
getUText(UText * fillIn,UErrorCode & status) const185   virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
getText(void) const186   virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
187 
188   /* -- ITERATION -- */
189   virtual int32_t first(void);
190   virtual int32_t preceding(int32_t offset);
191   virtual int32_t previous(void);
192   virtual UBool isBoundary(int32_t offset);
current(void) const193   virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
194 
195   virtual int32_t next(void);
196 
197   virtual int32_t next(int32_t n);
198   virtual int32_t following(int32_t offset);
199   virtual int32_t last(void);
200 
201 private:
202     /**
203      * Given that the fDelegate has already given its "initial" answer,
204      * find the NEXT actual (non-excepted) break.
205      * @param n initial position from delegate
206      * @return new break position or UBRK_DONE
207      */
208     int32_t internalNext(int32_t n);
209     /**
210      * Given that the fDelegate has already given its "initial" answer,
211      * find the PREV actual (non-excepted) break.
212      * @param n initial position from delegate
213      * @return new break position or UBRK_DONE
214      */
215     int32_t internalPrev(int32_t n);
216     /**
217      * set up the UText with the value of the fDelegate.
218      * Call this before calling breakExceptionAt.
219      * May be able to avoid excess calls
220      */
221     void resetState(UErrorCode &status);
222     /**
223      * Is there a match  (exception) at this spot?
224      */
225     enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
226     /**
227      * Determine if there is an exception at this spot
228      * @param n spot to check
229      * @return kNoExceptionHere or kExceptionHere
230      **/
231     enum EFBMatchResult breakExceptionAt(int32_t n);
232 };
233 
/**
 * Copy constructor: takes another reference on the other iterator's data
 * object and clones its delegate. fText is default-constructed.
 */
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
  : BreakIterator(other),
    fData(other.fData->incr()),
    fDelegate(other.fDelegate->clone())
{
  // nothing else to copy
}
238 
239 
SimpleFilteredSentenceBreakIterator(BreakIterator * adopt,UCharsTrie * forwards,UCharsTrie * backwards,UErrorCode & status)240 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
241   BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
242   fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
243   fDelegate(adopt)
244 {
245   // all set..
246 }
247 
~SimpleFilteredSentenceBreakIterator()248 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
249     fData = fData->decr();
250 }
251 
resetState(UErrorCode & status)252 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
253   fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
254 }
255 
256 SimpleFilteredSentenceBreakIterator::EFBMatchResult
breakExceptionAt(int32_t n)257 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
258     int64_t bestPosn = -1;
259     int32_t bestValue = -1;
260     // loops while 'n' points to an exception.
261     utext_setNativeIndex(fText.getAlias(), n); // from n..
262     fData->fBackwardsTrie->reset();
263     UChar32 uch;
264 
265     //if(debug2) u_printf(" n@ %d\n", n);
266     // Assume a space is following the '.'  (so we handle the case:  "Mr. /Brown")
267     if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) {  // TODO: skip a class of chars here??
268       // TODO only do this the 1st time?
269       //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
270     } else {
271       //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
272       uch = utext_next32(fText.getAlias());
273       //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
274     }
275 
276     UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
277 
278     while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL  &&   // more to consume backwards and..
279           USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
280       if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
281         bestPosn = utext_getNativeIndex(fText.getAlias());
282         bestValue = fData->fBackwardsTrie->getValue();
283       }
284       //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
285     }
286 
287     if(USTRINGTRIE_MATCHES(r)) { // exact match?
288       //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
289       bestValue = fData->fBackwardsTrie->getValue();
290       bestPosn = utext_getNativeIndex(fText.getAlias());
291       //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
292     }
293 
294     if(bestPosn>=0) {
295       //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
296 
297       //if(USTRINGTRIE_MATCHES(r)) {  // matched - so, now what?
298       //int32_t bestValue = fBackwardsTrie->getValue();
299       ////if(debug2) u_printf("rev< /%C/ matched, skip..%d  bestValue=%d\n", (UChar)uch, r, bestValue);
300 
301       if(bestValue == kMATCH) { // exact match!
302         //if(debug2) u_printf(" exact backward match\n");
303         return kExceptionHere; // See if the next is another exception.
304       } else if(bestValue == kPARTIAL
305                 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
306         //if(debug2) u_printf(" partial backward match\n");
307         // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
308         // to see if it matches something going forward.
309         fData->fForwardsPartialTrie->reset();
310         UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
311         utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
312         //if(debug2) u_printf("Retrying at %d\n", bestPosn);
313         while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
314               USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
315           //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
316         }
317         if(USTRINGTRIE_MATCHES(rfwd)) {
318           //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
319           // only full matches here, nothing to check
320           // skip the next:
321             return kExceptionHere;
322         } else {
323           //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
324           // no match (no exception) -return the 'underlying' break
325           return kNoExceptionHere;
326         }
327       } else {
328         return kNoExceptionHere; // internal error and/or no forwards trie
329       }
330     } else {
331       //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r);  // no best match
332       return kNoExceptionHere; // No match - so exit. Not an exception.
333     }
334 }
335 
336 // the workhorse single next.
337 int32_t
internalNext(int32_t n)338 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
339   if(n == UBRK_DONE || // at end  or
340     fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
341       return n;
342   }
343   // OK, do we need to break here?
344   UErrorCode status = U_ZERO_ERROR;
345   // refresh text
346   resetState(status);
347   if(U_FAILURE(status)) return UBRK_DONE; // bail out
348   int64_t utextLen = utext_nativeLength(fText.getAlias());
349 
350   //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
351   while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
352     SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
353 
354     switch(m) {
355     case kExceptionHere:
356       n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
357       continue;
358 
359     default:
360     case kNoExceptionHere:
361       return n;
362     }
363   }
364   return n;
365 }
366 
367 int32_t
internalPrev(int32_t n)368 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
369   if(n == 0 || n == UBRK_DONE || // at end  or
370     fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
371       return n;
372   }
373   // OK, do we need to break here?
374   UErrorCode status = U_ZERO_ERROR;
375   // refresh text
376   resetState(status);
377   if(U_FAILURE(status)) return UBRK_DONE; // bail out
378 
379   //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
380   while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
381     SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
382 
383     switch(m) {
384     case kExceptionHere:
385       n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
386       continue;
387 
388     default:
389     case kNoExceptionHere:
390       return n;
391     }
392   }
393   return n;
394 }
395 
396 
397 int32_t
next()398 SimpleFilteredSentenceBreakIterator::next() {
399   return internalNext(fDelegate->next());
400 }
401 
402 int32_t
first(void)403 SimpleFilteredSentenceBreakIterator::first(void) {
404   return internalNext(fDelegate->first());
405 }
406 
407 int32_t
preceding(int32_t offset)408 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
409   return internalPrev(fDelegate->preceding(offset));
410 }
411 
412 int32_t
previous(void)413 SimpleFilteredSentenceBreakIterator::previous(void) {
414   return internalPrev(fDelegate->previous());
415 }
416 
isBoundary(int32_t offset)417 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
418   if(!fDelegate->isBoundary(offset)) return false; // no break to suppress
419 
420   UErrorCode status = U_ZERO_ERROR;
421   resetState(status);
422 
423   SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
424 
425   switch(m) {
426   case kExceptionHere:
427     return false;
428   default:
429   case kNoExceptionHere:
430     return true;
431   }
432 }
433 
434 int32_t
next(int32_t offset)435 SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
436   return internalNext(fDelegate->next(offset));
437 }
438 
439 int32_t
following(int32_t offset)440 SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
441   return internalNext(fDelegate->following(offset));
442 }
443 
444 int32_t
last(void)445 SimpleFilteredSentenceBreakIterator::last(void) {
446   // Don't suppress a break opportunity at the end of text.
447   return fDelegate->last();
448 }
449 
450 
451 /**
452  * Concrete implementation of builder class.
453  */
454 class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
455 public:
456   virtual ~SimpleFilteredBreakIteratorBuilder();
457   SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
458   SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
459   virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
460   virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
461   virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
462 private:
463   UStringSet fSet;
464 };
465 
~SimpleFilteredBreakIteratorBuilder()466 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
467 {
468 }
469 
SimpleFilteredBreakIteratorBuilder(UErrorCode & status)470 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
471   : fSet(status)
472 {
473 }
474 
SimpleFilteredBreakIteratorBuilder(const Locale & fromLocale,UErrorCode & status)475 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
476   : fSet(status)
477 {
478   if(U_SUCCESS(status)) {
479     LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status));
480     LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &status));
481     LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &status));
482     if(U_FAILURE(status)) return; // leaves the builder empty, if you try to use it.
483 
484     LocalUResourceBundlePointer strs;
485     UErrorCode subStatus = status;
486     do {
487       strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
488       if(strs.isValid() && U_SUCCESS(subStatus)) {
489         UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
490         suppressBreakAfter(str, status); // load the string
491       }
492     } while (strs.isValid() && U_SUCCESS(subStatus));
493     if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
494       status = subStatus;
495     }
496   }
497 }
498 
499 UBool
suppressBreakAfter(const UnicodeString & exception,UErrorCode & status)500 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
501 {
502   UBool r = fSet.add(exception, status);
503   FB_TRACE("suppressBreakAfter",&exception,r,0);
504   return r;
505 }
506 
507 UBool
unsuppressBreakAfter(const UnicodeString & exception,UErrorCode & status)508 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
509 {
510   UBool r = fSet.remove(exception, status);
511   FB_TRACE("unsuppressBreakAfter",&exception,r,0);
512   return r;
513 }
514 
515 /**
516  * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
517  * Work around this.
518  *
519  * Note: "new UnicodeString[subCount]" ends up calling global operator new
520  * on MSVC2012 for some reason.
521  */
newUnicodeStringArray(size_t count)522 static inline UnicodeString* newUnicodeStringArray(size_t count) {
523     return new UnicodeString[count ? count : 1];
524 }
525 
526 BreakIterator *
build(BreakIterator * adoptBreakIterator,UErrorCode & status)527 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
528   LocalPointer<BreakIterator> adopt(adoptBreakIterator);
529 
530   LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
531   LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
532   if(U_FAILURE(status)) {
533     return NULL;
534   }
535 
536   int32_t revCount = 0;
537   int32_t fwdCount = 0;
538 
539   int32_t subCount = fSet.size();
540 
541   UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
542 
543   LocalArray<UnicodeString> ustrs(ustrs_ptr);
544 
545   LocalMemory<int> partials;
546   partials.allocateInsteadAndReset(subCount);
547 
548   LocalPointer<UCharsTrie>    backwardsTrie; //  i.e. ".srM" for Mrs.
549   LocalPointer<UCharsTrie>    forwardsPartialTrie; //  Has ".a" for "a.M."
550 
551   int n=0;
552   for ( int32_t i = 0;
553         i<fSet.size();
554         i++) {
555     const UnicodeString *abbr = fSet.getStringAt(i);
556     if(abbr) {
557       FB_TRACE("build",abbr,TRUE,i);
558       ustrs[n] = *abbr; // copy by value
559       FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
560     } else {
561       FB_TRACE("build",abbr,FALSE,i);
562       status = U_MEMORY_ALLOCATION_ERROR;
563       return NULL;
564     }
565     partials[n] = 0; // default: not partial
566     n++;
567   }
568   // first pass - find partials.
569   for(int i=0;i<subCount;i++) {
570     int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
571     if(nn>-1 && (nn+1)!=ustrs[i].length()) {
572       FB_TRACE("partial",&ustrs[i],FALSE,i);
573       // is partial.
574       // is it unique?
575       int sameAs = -1;
576       for(int j=0;j<subCount;j++) {
577         if(j==i) continue;
578         if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
579           FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
580           //UBool otherIsPartial = ((nn+1)!=ustrs[j].length());  // true if ustrs[j] doesn't end at nn
581           if(partials[j]==0) { // hasn't been processed yet
582             partials[j] = kSuppressInReverse | kAddToForward;
583             FB_TRACE("suppressing",&ustrs[j],FALSE,j);
584           } else if(partials[j] & kSuppressInReverse) {
585             sameAs = j; // the other entry is already in the reverse table.
586           }
587         }
588       }
589       FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
590       FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
591       UnicodeString prefix(ustrs[i], 0, nn+1);
592       if(sameAs == -1 && partials[i] == 0) {
593         // first one - add the prefix to the reverse table.
594         prefix.reverse();
595         builder->add(prefix, kPARTIAL, status);
596         revCount++;
597         FB_TRACE("Added partial",&prefix,FALSE, i);
598         FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
599         partials[i] = kSuppressInReverse | kAddToForward;
600       } else {
601         FB_TRACE("NOT adding partial",&prefix,FALSE, i);
602         FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
603       }
604     }
605   }
606   for(int i=0;i<subCount;i++) {
607     if(partials[i]==0) {
608       ustrs[i].reverse();
609       builder->add(ustrs[i], kMATCH, status);
610       revCount++;
611       FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
612     } else {
613       FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
614 
615       // an optimization would be to only add the portion after the '.'
616       // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
617       // instead of "Ph.D." since we already know the "Ph." part is a match.
618       // would need the trie to be able to hold 0-length strings, though.
619       builder2->add(ustrs[i], kMATCH, status); // forward
620       fwdCount++;
621       //ustrs[i].reverse();
622       ////if(debug2) u_printf("SUPPRESS- not Added(%d):  /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
623     }
624   }
625   FB_TRACE("AbbrCount",NULL,FALSE, subCount);
626 
627   if(revCount>0) {
628     backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
629     if(U_FAILURE(status)) {
630       FB_TRACE(u_errorName(status),NULL,FALSE, -1);
631       return NULL;
632     }
633   }
634 
635   if(fwdCount>0) {
636     forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
637     if(U_FAILURE(status)) {
638       FB_TRACE(u_errorName(status),NULL,FALSE, -1);
639       return NULL;
640     }
641   }
642 
643   return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
644 }
645 
646 
647 // ----------- Base class implementation
648 
FilteredBreakIteratorBuilder()649 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
650 }
651 
~FilteredBreakIteratorBuilder()652 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
653 }
654 
655 FilteredBreakIteratorBuilder *
createInstance(const Locale & where,UErrorCode & status)656 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
657   if(U_FAILURE(status)) return NULL;
658   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
659   return (U_SUCCESS(status))? ret.orphan(): NULL;
660 }
661 
662 FilteredBreakIteratorBuilder *
createInstance(UErrorCode & status)663 FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
664   if(U_FAILURE(status)) return NULL;
665   LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
666   return (U_SUCCESS(status))? ret.orphan(): NULL;
667 }
668 
669 U_NAMESPACE_END
670 
#endif // !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
672