1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2014-2015, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9
10 #include "unicode/utypes.h"
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
12
13 #include "cmemory.h"
14
15 #include "unicode/filteredbrk.h"
16 #include "unicode/ucharstriebuilder.h"
17 #include "unicode/ures.h"
18
19 #include "uresimp.h" // ures_getByKeyWithFallback
20 #include "ubrkimpl.h" // U_ICUDATA_BRKITR
21 #include "uvector.h"
22 #include "cmemory.h"
23
24 U_NAMESPACE_BEGIN
25
#ifndef FB_DEBUG
#define FB_DEBUG 0
#endif

#if FB_DEBUG
#include <stdio.h>
/**
 * Debug-only trace helper; compiled only when FB_DEBUG is nonzero.
 * Writes one line to stderr with the call site, a message, a string, a flag and a number.
 * @param m message text
 * @param s string to print, or NULL (printed as "NULL")
 * @param b flag, printed as 'T' or 'F'
 * @param d integer detail value
 * @param f source file name of the call site
 * @param l source line of the call site
 */
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
  char buf[2048];
  if(s) {
    // Leave room for, and always write, a terminating NUL: a string that fills
    // the buffer would otherwise be handed to fprintf("%s") unterminated.
    s->extract(0, s->length(), buf, (uint32_t)(sizeof(buf)-1));
    buf[sizeof(buf)-1] = 0;
  } else {
    strcpy(buf,"NULL");
  }
  fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
          f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
}

#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
#define FB_TRACE(m,s,b,d)
#endif
47
48 /**
49 * Used with sortedInsert()
50 */
compareUnicodeString(UElement t1,UElement t2)51 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
52 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
53 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
54 return a.compare(b);
55 }
56
57 /**
58 * A UVector which implements a set of strings.
59 */
60 class U_COMMON_API UStringSet : public UVector {
61 public:
UStringSet(UErrorCode & status)62 UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
63 uhash_compareUnicodeString,
64 1,
65 status) {}
66 virtual ~UStringSet();
67 /**
68 * Is this UnicodeSet contained?
69 */
contains(const UnicodeString & s)70 inline UBool contains(const UnicodeString& s) {
71 return contains((void*) &s);
72 }
73 using UVector::contains;
74 /**
75 * Return the ith UnicodeString alias
76 */
getStringAt(int32_t i) const77 inline const UnicodeString* getStringAt(int32_t i) const {
78 return (const UnicodeString*)elementAt(i);
79 }
80 /**
81 * Adopt the UnicodeString if not already contained.
82 * Caller no longer owns the pointer in any case.
83 * @return true if adopted successfully, false otherwise (error, or else duplicate)
84 */
adopt(UnicodeString * str,UErrorCode & status)85 inline UBool adopt(UnicodeString *str, UErrorCode &status) {
86 if(U_FAILURE(status) || contains(*str)) {
87 delete str;
88 return false;
89 } else {
90 sortedInsert(str, compareUnicodeString, status);
91 if(U_FAILURE(status)) {
92 delete str;
93 return false;
94 }
95 return true;
96 }
97 }
98 /**
99 * Add by value.
100 * @return true if successfully adopted.
101 */
add(const UnicodeString & str,UErrorCode & status)102 inline UBool add(const UnicodeString& str, UErrorCode &status) {
103 if(U_FAILURE(status)) return false;
104 UnicodeString *t = new UnicodeString(str);
105 if(t==NULL) {
106 status = U_MEMORY_ALLOCATION_ERROR; return false;
107 }
108 return adopt(t, status);
109 }
110 /**
111 * Remove this string.
112 * @return true if successfully removed, false otherwise (error, or else it wasn't there)
113 */
remove(const UnicodeString & s,UErrorCode & status)114 inline UBool remove(const UnicodeString &s, UErrorCode &status) {
115 if(U_FAILURE(status)) return false;
116 return removeElement((void*) &s);
117 }
118 };
119
120 /**
121 * Virtual, won't be inlined
122 */
~UStringSet()123 UStringSet::~UStringSet() {}
124
125 /* ----------------------------------------------------------- */
126
127
128 /* Filtered Break constants */
129 static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
130 static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
131 static const int32_t kSuppressInReverse = (1<<0);
132 static const int32_t kAddToForward = (1<<1);
133 static const UChar kFULLSTOP = 0x002E; // '.'
134
135 /**
136 * Shared data for SimpleFilteredSentenceBreakIterator
137 */
138 class SimpleFilteredSentenceBreakData : public UMemory {
139 public:
SimpleFilteredSentenceBreakData(UCharsTrie * forwards,UCharsTrie * backwards)140 SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
141 : fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
incr()142 SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
decr()143 SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
144 virtual ~SimpleFilteredSentenceBreakData();
145
146 LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
147 LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
148 int32_t refcount;
149 };
150
~SimpleFilteredSentenceBreakData()151 SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
152
153 /**
154 * Concrete implementation
155 */
156 class SimpleFilteredSentenceBreakIterator : public BreakIterator {
157 public:
158 SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
159 SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
160 virtual ~SimpleFilteredSentenceBreakIterator();
161 private:
162 SimpleFilteredSentenceBreakData *fData;
163 LocalPointer<BreakIterator> fDelegate;
164 LocalUTextPointer fText;
165
166 /* -- subclass interface -- */
167 public:
168 /* -- cloning and other subclass stuff -- */
createBufferClone(void *,int32_t &,UErrorCode & status)169 virtual BreakIterator * createBufferClone(void * /*stackBuffer*/,
170 int32_t &/*BufferSize*/,
171 UErrorCode &status) {
172 // for now - always deep clone
173 status = U_SAFECLONE_ALLOCATED_WARNING;
174 return clone();
175 }
clone(void) const176 virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
getDynamicClassID(void) const177 virtual UClassID getDynamicClassID(void) const { return NULL; }
operator ==(const BreakIterator & o) const178 virtual UBool operator==(const BreakIterator& o) const { if(this==&o) return true; return false; }
179
180 /* -- text modifying -- */
setText(UText * text,UErrorCode & status)181 virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
refreshInputText(UText * input,UErrorCode & status)182 virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
adoptText(CharacterIterator * it)183 virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
setText(const UnicodeString & text)184 virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
185
186 /* -- other functions that are just delegated -- */
getUText(UText * fillIn,UErrorCode & status) const187 virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
getText(void) const188 virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
189
190 /* -- ITERATION -- */
191 virtual int32_t first(void);
192 virtual int32_t preceding(int32_t offset);
193 virtual int32_t previous(void);
194 virtual UBool isBoundary(int32_t offset);
current(void) const195 virtual int32_t current(void) const { return fDelegate->current(); } // we keep the delegate current, so this should be correct.
196
197 virtual int32_t next(void);
198
199 virtual int32_t next(int32_t n);
200 virtual int32_t following(int32_t offset);
201 virtual int32_t last(void);
202
203 private:
204 /**
205 * Given that the fDelegate has already given its "initial" answer,
206 * find the NEXT actual (non-excepted) break.
207 * @param n initial position from delegate
208 * @return new break position or UBRK_DONE
209 */
210 int32_t internalNext(int32_t n);
211 /**
212 * Given that the fDelegate has already given its "initial" answer,
213 * find the PREV actual (non-excepted) break.
214 * @param n initial position from delegate
215 * @return new break position or UBRK_DONE
216 */
217 int32_t internalPrev(int32_t n);
218 /**
219 * set up the UText with the value of the fDelegate.
220 * Call this before calling breakExceptionAt.
221 * May be able to avoid excess calls
222 */
223 void resetState(UErrorCode &status);
224 /**
225 * Is there a match (exception) at this spot?
226 */
227 enum EFBMatchResult { kNoExceptionHere, kExceptionHere };
228 /**
229 * Determine if there is an exception at this spot
230 * @param n spot to check
231 * @return kNoExceptionHere or kExceptionHere
232 **/
233 enum EFBMatchResult breakExceptionAt(int32_t n);
234 };
235
/**
 * Copy constructor: takes another reference on the other iterator's shared
 * data and clones its delegate. fText is left default-constructed.
 */
SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
  : BreakIterator(other),
    fData(other.fData->incr()),
    fDelegate(other.fDelegate->clone())
{
}
240
241
SimpleFilteredSentenceBreakIterator(BreakIterator * adopt,UCharsTrie * forwards,UCharsTrie * backwards,UErrorCode & status)242 SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
243 BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
244 fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
245 fDelegate(adopt)
246 {
247 // all set..
248 }
249
~SimpleFilteredSentenceBreakIterator()250 SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
251 fData = fData->decr();
252 }
253
resetState(UErrorCode & status)254 void SimpleFilteredSentenceBreakIterator::resetState(UErrorCode &status) {
255 fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
256 }
257
258 SimpleFilteredSentenceBreakIterator::EFBMatchResult
breakExceptionAt(int32_t n)259 SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
260 int64_t bestPosn = -1;
261 int32_t bestValue = -1;
262 // loops while 'n' points to an exception.
263 utext_setNativeIndex(fText.getAlias(), n); // from n..
264 fData->fBackwardsTrie->reset();
265 UChar32 uch;
266
267 //if(debug2) u_printf(" n@ %d\n", n);
268 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
269 if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
270 // TODO only do this the 1st time?
271 //if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
272 } else {
273 //if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
274 uch = utext_next32(fText.getAlias());
275 //if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
276 }
277
278 UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
279
280 while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
281 USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
282 if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
283 bestPosn = utext_getNativeIndex(fText.getAlias());
284 bestValue = fData->fBackwardsTrie->getValue();
285 }
286 //if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
287 }
288
289 if(USTRINGTRIE_MATCHES(r)) { // exact match?
290 //if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
291 bestValue = fData->fBackwardsTrie->getValue();
292 bestPosn = utext_getNativeIndex(fText.getAlias());
293 //if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
294 }
295
296 if(bestPosn>=0) {
297 //if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
298
299 //if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
300 //int32_t bestValue = fBackwardsTrie->getValue();
301 ////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
302
303 if(bestValue == kMATCH) { // exact match!
304 //if(debug2) u_printf(" exact backward match\n");
305 return kExceptionHere; // See if the next is another exception.
306 } else if(bestValue == kPARTIAL
307 && fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
308 //if(debug2) u_printf(" partial backward match\n");
309 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
310 // to see if it matches something going forward.
311 fData->fForwardsPartialTrie->reset();
312 UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
313 utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
314 //if(debug2) u_printf("Retrying at %d\n", bestPosn);
315 while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
316 USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
317 //if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
318 }
319 if(USTRINGTRIE_MATCHES(rfwd)) {
320 //if(debug2) u_printf("fwd> /%C/ == forward match!\n", (UChar)uch);
321 // only full matches here, nothing to check
322 // skip the next:
323 return kExceptionHere;
324 } else {
325 //if(debug2) u_printf("fwd> /%C/ no match.\n", (UChar)uch);
326 // no match (no exception) -return the 'underlying' break
327 return kNoExceptionHere;
328 }
329 } else {
330 return kNoExceptionHere; // internal error and/or no forwards trie
331 }
332 } else {
333 //if(debug2) u_printf("rev< /%C/ .. no match..%d\n", (UChar)uch, r); // no best match
334 return kNoExceptionHere; // No match - so exit. Not an exception.
335 }
336 }
337
338 // the workhorse single next.
339 int32_t
internalNext(int32_t n)340 SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
341 if(n == UBRK_DONE || // at end or
342 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
343 return n;
344 }
345 // OK, do we need to break here?
346 UErrorCode status = U_ZERO_ERROR;
347 // refresh text
348 resetState(status);
349 if(U_FAILURE(status)) return UBRK_DONE; // bail out
350 int64_t utextLen = utext_nativeLength(fText.getAlias());
351
352 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
353 while (n != UBRK_DONE && n != utextLen) { // outer loop runs once per underlying break (from fDelegate).
354 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
355
356 switch(m) {
357 case kExceptionHere:
358 n = fDelegate->next(); // skip this one. Find the next lowerlevel break.
359 continue;
360
361 default:
362 case kNoExceptionHere:
363 return n;
364 }
365 }
366 return n;
367 }
368
369 int32_t
internalPrev(int32_t n)370 SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
371 if(n == 0 || n == UBRK_DONE || // at end or
372 fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
373 return n;
374 }
375 // OK, do we need to break here?
376 UErrorCode status = U_ZERO_ERROR;
377 // refresh text
378 resetState(status);
379 if(U_FAILURE(status)) return UBRK_DONE; // bail out
380
381 //if(debug2) u_printf("str, native len=%d\n", utext_nativeLength(fText.getAlias()));
382 while (n != UBRK_DONE && n != 0) { // outer loop runs once per underlying break (from fDelegate).
383 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(n);
384
385 switch(m) {
386 case kExceptionHere:
387 n = fDelegate->previous(); // skip this one. Find the next lowerlevel break.
388 continue;
389
390 default:
391 case kNoExceptionHere:
392 return n;
393 }
394 }
395 return n;
396 }
397
398
399 int32_t
next()400 SimpleFilteredSentenceBreakIterator::next() {
401 return internalNext(fDelegate->next());
402 }
403
404 int32_t
first(void)405 SimpleFilteredSentenceBreakIterator::first(void) {
406 // Don't suppress a break opportunity at the beginning of text.
407 return fDelegate->first();
408 }
409
410 int32_t
preceding(int32_t offset)411 SimpleFilteredSentenceBreakIterator::preceding(int32_t offset) {
412 return internalPrev(fDelegate->preceding(offset));
413 }
414
415 int32_t
previous(void)416 SimpleFilteredSentenceBreakIterator::previous(void) {
417 return internalPrev(fDelegate->previous());
418 }
419
isBoundary(int32_t offset)420 UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
421 if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
422
423 if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
424
425 UErrorCode status = U_ZERO_ERROR;
426 resetState(status);
427
428 SimpleFilteredSentenceBreakIterator::EFBMatchResult m = breakExceptionAt(offset);
429
430 switch(m) {
431 case kExceptionHere:
432 return false;
433 default:
434 case kNoExceptionHere:
435 return true;
436 }
437 }
438
439 int32_t
next(int32_t offset)440 SimpleFilteredSentenceBreakIterator::next(int32_t offset) {
441 return internalNext(fDelegate->next(offset));
442 }
443
444 int32_t
following(int32_t offset)445 SimpleFilteredSentenceBreakIterator::following(int32_t offset) {
446 return internalNext(fDelegate->following(offset));
447 }
448
449 int32_t
last(void)450 SimpleFilteredSentenceBreakIterator::last(void) {
451 // Don't suppress a break opportunity at the end of text.
452 return fDelegate->last();
453 }
454
455
456 /**
457 * Concrete implementation of builder class.
458 */
459 class U_COMMON_API SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
460 public:
461 virtual ~SimpleFilteredBreakIteratorBuilder();
462 SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
463 SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
464 virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
465 virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
466 virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
467 private:
468 UStringSet fSet;
469 };
470
~SimpleFilteredBreakIteratorBuilder()471 SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
472 {
473 }
474
SimpleFilteredBreakIteratorBuilder(UErrorCode & status)475 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
476 : fSet(status)
477 {
478 }
479
SimpleFilteredBreakIteratorBuilder(const Locale & fromLocale,UErrorCode & status)480 SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
481 : fSet(status)
482 {
483 if(U_SUCCESS(status)) {
484 UErrorCode subStatus = U_ZERO_ERROR;
485 LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &subStatus));
486 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
487 status = subStatus; // copy the failing status
488 #if FB_DEBUG
489 fprintf(stderr, "open BUNDLE %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
490 #endif
491 return; // leaves the builder empty, if you try to use it.
492 }
493 LocalUResourceBundlePointer exceptions(ures_getByKeyWithFallback(b.getAlias(), "exceptions", NULL, &subStatus));
494 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
495 status = subStatus; // copy the failing status
496 #if FB_DEBUG
497 fprintf(stderr, "open EXCEPTIONS %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
498 #endif
499 return; // leaves the builder empty, if you try to use it.
500 }
501 LocalUResourceBundlePointer breaks(ures_getByKeyWithFallback(exceptions.getAlias(), "SentenceBreak", NULL, &subStatus));
502
503 #if FB_DEBUG
504 {
505 UErrorCode subsub = subStatus;
506 fprintf(stderr, "open SentenceBreak %s => %s, %s\n", fromLocale.getBaseName(), ures_getLocale(breaks.getAlias(), &subsub), u_errorName(subStatus));
507 }
508 #endif
509
510 if (U_FAILURE(subStatus) || (subStatus == U_USING_DEFAULT_WARNING) ) {
511 status = subStatus; // copy the failing status
512 #if FB_DEBUG
513 fprintf(stderr, "open %s : %s, %s\n", fromLocale.getBaseName(), "[exit]", u_errorName(status));
514 #endif
515 return; // leaves the builder empty, if you try to use it.
516 }
517
518 LocalUResourceBundlePointer strs;
519 subStatus = status; // Pick up inherited warning status now
520 do {
521 strs.adoptInstead(ures_getNextResource(breaks.getAlias(), strs.orphan(), &subStatus));
522 if(strs.isValid() && U_SUCCESS(subStatus)) {
523 UnicodeString str(ures_getUnicodeString(strs.getAlias(), &status));
524 suppressBreakAfter(str, status); // load the string
525 }
526 } while (strs.isValid() && U_SUCCESS(subStatus));
527 if(U_FAILURE(subStatus)&&subStatus!=U_INDEX_OUTOFBOUNDS_ERROR&&U_SUCCESS(status)) {
528 status = subStatus;
529 }
530 }
531 }
532
533 UBool
suppressBreakAfter(const UnicodeString & exception,UErrorCode & status)534 SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
535 {
536 UBool r = fSet.add(exception, status);
537 FB_TRACE("suppressBreakAfter",&exception,r,0);
538 return r;
539 }
540
541 UBool
unsuppressBreakAfter(const UnicodeString & exception,UErrorCode & status)542 SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
543 {
544 UBool r = fSet.remove(exception, status);
545 FB_TRACE("unsuppressBreakAfter",&exception,r,0);
546 return r;
547 }
548
549 /**
550 * Jitterbug 2974: MSVC has a bug whereby new X[0] behaves badly.
551 * Work around this.
552 *
553 * Note: "new UnicodeString[subCount]" ends up calling global operator new
554 * on MSVC2012 for some reason.
555 */
newUnicodeStringArray(size_t count)556 static inline UnicodeString* newUnicodeStringArray(size_t count) {
557 return new UnicodeString[count ? count : 1];
558 }
559
560 BreakIterator *
build(BreakIterator * adoptBreakIterator,UErrorCode & status)561 SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
562 LocalPointer<BreakIterator> adopt(adoptBreakIterator);
563
564 LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status), status);
565 LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status), status);
566 if(U_FAILURE(status)) {
567 return NULL;
568 }
569
570 int32_t revCount = 0;
571 int32_t fwdCount = 0;
572
573 int32_t subCount = fSet.size();
574
575 UnicodeString *ustrs_ptr = newUnicodeStringArray(subCount);
576
577 LocalArray<UnicodeString> ustrs(ustrs_ptr);
578
579 LocalMemory<int> partials;
580 partials.allocateInsteadAndReset(subCount);
581
582 LocalPointer<UCharsTrie> backwardsTrie; // i.e. ".srM" for Mrs.
583 LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
584
585 int n=0;
586 for ( int32_t i = 0;
587 i<fSet.size();
588 i++) {
589 const UnicodeString *abbr = fSet.getStringAt(i);
590 if(abbr) {
591 FB_TRACE("build",abbr,TRUE,i);
592 ustrs[n] = *abbr; // copy by value
593 FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
594 } else {
595 FB_TRACE("build",abbr,FALSE,i);
596 status = U_MEMORY_ALLOCATION_ERROR;
597 return NULL;
598 }
599 partials[n] = 0; // default: not partial
600 n++;
601 }
602 // first pass - find partials.
603 for(int i=0;i<subCount;i++) {
604 int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
605 if(nn>-1 && (nn+1)!=ustrs[i].length()) {
606 FB_TRACE("partial",&ustrs[i],FALSE,i);
607 // is partial.
608 // is it unique?
609 int sameAs = -1;
610 for(int j=0;j<subCount;j++) {
611 if(j==i) continue;
612 if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
613 FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
614 //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
615 if(partials[j]==0) { // hasn't been processed yet
616 partials[j] = kSuppressInReverse | kAddToForward;
617 FB_TRACE("suppressing",&ustrs[j],FALSE,j);
618 } else if(partials[j] & kSuppressInReverse) {
619 sameAs = j; // the other entry is already in the reverse table.
620 }
621 }
622 }
623 FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
624 FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
625 UnicodeString prefix(ustrs[i], 0, nn+1);
626 if(sameAs == -1 && partials[i] == 0) {
627 // first one - add the prefix to the reverse table.
628 prefix.reverse();
629 builder->add(prefix, kPARTIAL, status);
630 revCount++;
631 FB_TRACE("Added partial",&prefix,FALSE, i);
632 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
633 partials[i] = kSuppressInReverse | kAddToForward;
634 } else {
635 FB_TRACE("NOT adding partial",&prefix,FALSE, i);
636 FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
637 }
638 }
639 }
640 for(int i=0;i<subCount;i++) {
641 if(partials[i]==0) {
642 ustrs[i].reverse();
643 builder->add(ustrs[i], kMATCH, status);
644 revCount++;
645 FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
646 } else {
647 FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
648
649 // an optimization would be to only add the portion after the '.'
650 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
651 // instead of "Ph.D." since we already know the "Ph." part is a match.
652 // would need the trie to be able to hold 0-length strings, though.
653 builder2->add(ustrs[i], kMATCH, status); // forward
654 fwdCount++;
655 //ustrs[i].reverse();
656 ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
657 }
658 }
659 FB_TRACE("AbbrCount",NULL,FALSE, subCount);
660
661 if(revCount>0) {
662 backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
663 if(U_FAILURE(status)) {
664 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
665 return NULL;
666 }
667 }
668
669 if(fwdCount>0) {
670 forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
671 if(U_FAILURE(status)) {
672 FB_TRACE(u_errorName(status),NULL,FALSE, -1);
673 return NULL;
674 }
675 }
676
677 return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
678 }
679
680
681 // ----------- Base class implementation
682
FilteredBreakIteratorBuilder()683 FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
684 }
685
~FilteredBreakIteratorBuilder()686 FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
687 }
688
689 FilteredBreakIteratorBuilder *
createInstance(const Locale & where,UErrorCode & status)690 FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
691 if(U_FAILURE(status)) return NULL;
692 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status), status);
693 return (U_SUCCESS(status))? ret.orphan(): NULL;
694 }
695
696 FilteredBreakIteratorBuilder *
createInstance(UErrorCode & status)697 FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
698 if(U_FAILURE(status)) return NULL;
699 LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status), status);
700 return (U_SUCCESS(status))? ret.orphan(): NULL;
701 }
702
703 U_NAMESPACE_END
704
#endif // !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
706