1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 1999-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 /************************************************************************
9 *   Date        Name        Description
10 *   12/15/99    Madhu        Creation.
11 *   01/12/2000  Madhu        Updated for changed API and added new tests
12 ************************************************************************/
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16 
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20 
21 #include "unicode/brkiter.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/numfmt.h"
24 #include "unicode/rbbi.h"
25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26 #include "unicode/regex.h"
27 #endif
28 #include "unicode/schriter.h"
29 #include "unicode/uchar.h"
30 #include "unicode/utf16.h"
31 #include "unicode/ucnv.h"
32 #include "unicode/uniset.h"
33 #include "unicode/uscript.h"
34 #include "unicode/ustring.h"
35 #include "unicode/utext.h"
36 
37 #include "charstr.h"
38 #include "cmemory.h"
39 #include "intltest.h"
40 #include "rbbitst.h"
41 #include "utypeinfo.h"  // for 'typeid' to work
42 #include "uvector.h"
43 #include "uvectr32.h"
44 
45 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
46 #include "unicode/filteredbrk.h"
47 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
48 
// TEST_ASSERT(x): when the expression x is false, report a failure tagged with
// the current file and line.  Expands to an unqualified errln() call, so it can
// only be used where errln() is callable.
#define TEST_ASSERT(x) \
    { \
        if (!(x)) { \
            errln("Failure in file %s, line %d", __FILE__, __LINE__); \
        } \
    }

// TEST_ASSERT_SUCCESS(errcode): when errcode is a failure code, report it
// (with its symbolic name) through errcheckln().  Note that errcode appears
// more than once in the expansion, so pass a plain variable.
#define TEST_ASSERT_SUCCESS(errcode) \
    { \
        if (U_FAILURE(errcode)) { \
            errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                       __FILE__, __LINE__, u_errorName(errcode)); \
        } \
    }
54 
55 
56 //---------------------------------------------
57 // runIndexedTest
58 //---------------------------------------------
59 
60 
61 //  Note:  Before adding new tests to this file, check whether the desired test data can
62 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
63 //         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
65 //         will run there as well, without additional effort.
66 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)67 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
68 {
69     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
70     fTestParams = params;
71 
72     TESTCASE_AUTO_BEGIN;
73 #if !UCONFIG_NO_FILE_IO
74     TESTCASE_AUTO(TestBug4153072);
75 #endif
76     TESTCASE_AUTO(TestStatusReturn);
77 #if !UCONFIG_NO_FILE_IO
78     TESTCASE_AUTO(TestUnicodeFiles);
79     TESTCASE_AUTO(TestEmptyString);
80 #endif
81     TESTCASE_AUTO(TestGetAvailableLocales);
82     TESTCASE_AUTO(TestGetDisplayName);
83 #if !UCONFIG_NO_FILE_IO
84     TESTCASE_AUTO(TestEndBehaviour);
85     TESTCASE_AUTO(TestWordBreaks);
86     TESTCASE_AUTO(TestWordBoundary);
87     TESTCASE_AUTO(TestLineBreaks);
88     TESTCASE_AUTO(TestSentBreaks);
89     TESTCASE_AUTO(TestExtended);
90 #endif
91 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
92     TESTCASE_AUTO(TestMonkey);
93 #endif
94 #if !UCONFIG_NO_FILE_IO
95     TESTCASE_AUTO(TestBug3818);
96 #endif
97     TESTCASE_AUTO(TestDebug);
98 #if !UCONFIG_NO_FILE_IO
99     TESTCASE_AUTO(TestBug5775);
100 #endif
101     TESTCASE_AUTO(TestBug9983);
102     TESTCASE_AUTO(TestDictRules);
103     TESTCASE_AUTO(TestBug5532);
104     TESTCASE_AUTO(TestBug7547);
105     TESTCASE_AUTO(TestBug12797);
106     TESTCASE_AUTO(TestBug12918);
107     TESTCASE_AUTO_END;
108 }
109 
110 
111 //---------------------------------------------------------------------------
112 //
113 //   class BITestData   Holds a set of Break iterator test data and results
114 //                      Includes
115 //                         - the string data to be broken
116 //                         - a vector of the expected break positions.
117 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
119 //                         - The expected break tag values.
120 //                         - Vectors of actual break positions and tag values.
121 //                         - Functions for comparing actual with expected and
122 //                            reporting errors.
123 //
124 //----------------------------------------------------------------------------
125 class BITestData {
126 public:
127     UnicodeString    fDataToBreak;
128     UVector          fExpectedBreakPositions;
129     UVector          fExpectedTags;
130     UVector          fLineNum;
131     UVector          fActualBreakPositions;   // Test Results.
132     UVector          fActualTags;
133 
134     BITestData(UErrorCode &status);
135     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
136     void             checkResults(const char *heading, RBBITest *test);
137     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
138     void             clearResults();
139 };
140 
141 //
142 // Constructor.
143 //
BITestData(UErrorCode & status)144 BITestData::BITestData(UErrorCode &status)
145 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
146   fActualTags(status)
147 {
148 }
149 
150 //
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
152 //                 The macro form collects the line number, which is helpful
153 //                 when tracking down failures.
154 //
155 //                 A null data item is inserted at the start of each test's data
156 //                  to put the starting zero into the data list.  The position saved for
157 //                  each non-null item is its ending position.
158 //
159 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)160 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
161     if (U_FAILURE(status)) {return;}
162     if (data != NULL) {
163         fDataToBreak.append(CharsToUnicodeString(data));
164     }
165     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
166     fExpectedTags.addElement(tag, status);
167     fLineNum.addElement(lineNum, status);
168 }
169 
170 
171 //
172 //  checkResults.   Compare the actual and expected break positions, report any differences.
173 //
checkResults(const char * heading,RBBITest * test)174 void BITestData::checkResults(const char *heading, RBBITest *test) {
175     int32_t   expectedIndex = 0;
176     int32_t   actualIndex = 0;
177 
178     for (;;) {
179         // If we've run through both the expected and actual results vectors, we're done.
180         //   break out of the loop.
181         if (expectedIndex >= fExpectedBreakPositions.size() &&
182             actualIndex   >= fActualBreakPositions.size()) {
183             break;
184         }
185 
186 
187         if (expectedIndex >= fExpectedBreakPositions.size()) {
188             err(heading, test, expectedIndex-1, actualIndex);
189             actualIndex++;
190             continue;
191         }
192 
193         if (actualIndex >= fActualBreakPositions.size()) {
194             err(heading, test, expectedIndex, actualIndex-1);
195             expectedIndex++;
196             continue;
197         }
198 
199         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
200             err(heading, test, expectedIndex, actualIndex);
201             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
202             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
203                 actualIndex++;
204             } else {
205                 expectedIndex++;
206             }
207             continue;
208         }
209 
210         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
211             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
212                 heading, fLineNum.elementAt(expectedIndex),
213                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
214         }
215 
216         actualIndex++;
217         expectedIndex++;
218     }
219 }
220 
221 //
222 //  err   -  An error was found.  Report it, along with information about where the
223 //                                incorrectly broken test data appeared in the source file.
224 //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)225 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
226 {
227     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
228     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
229     int32_t   o        = 0;
230     int32_t   line     = fLineNum.elementAti(expectedIdx);
231     if (expectedIdx > 0) {
232         // The line numbers are off by one because a premature break occurs somewhere
233         //    within the previous item, rather than at the start of the current (expected) item.
234         //    We want to report the offset of the unexpected break from the start of
235         //      this previous item.
236         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
237     }
238     if (actual < expected) {
239         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
240     } else {
241         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
242     }
243 }
244 
245 
clearResults()246 void BITestData::clearResults() {
247     fActualBreakPositions.removeAllElements();
248     fActualTags.removeAllElements();
249 }
250 
251 
252 //--------------------------------------------------------------------------------------
253 //
254 //    RBBITest    constructor and destructor
255 //
256 //--------------------------------------------------------------------------------------
257 
RBBITest()258 RBBITest::RBBITest() {
259     fTestParams = NULL;
260 }
261 
262 
~RBBITest()263 RBBITest::~RBBITest() {
264 }
265 
266 //-----------------------------------------------------------------------------------
267 //
268 //   Test for status {tag} return value from break rules.
269 //        TODO:  a more thorough test.
270 //
271 //-----------------------------------------------------------------------------------
TestStatusReturn()272 void RBBITest::TestStatusReturn() {
273      UnicodeString rulesString1("$Letters = [:L:];\n"
274                                   "$Numbers = [:N:];\n"
275                                   "$Letters+{1};\n"
276                                   "$Numbers+{2};\n"
277                                   "Help\\ /me\\!{4};\n"
278                                   "[^$Letters $Numbers];\n"
279                                   "!.*;\n", -1, US_INV);
280      UnicodeString testString1  = "abc123..abc Help me Help me!";
281                                 // 01234567890123456789012345678
282      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
283      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
284 
285      UErrorCode status=U_ZERO_ERROR;
286      UParseError    parseError;
287 
288      LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
289      if(U_FAILURE(status)) {
290          dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
291          return;
292      }
293      int32_t  pos;
294      int32_t  i = 0;
295      bi->setText(testString1);
296      for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
297          if (pos != bounds1[i]) {
298              errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
299              break;
300          }
301 
302          int tag = bi->getRuleStatus();
303          if (tag != brkStatus[i]) {
304              errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
305              break;
306          }
307          i++;
308      }
309 }
310 
311 
printStringBreaks(UText * tstr,int expected[],int expectedCount)312 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
313     UErrorCode status = U_ZERO_ERROR;
314     char name[100];
315     printf("code    alpha extend alphanum type word sent line name\n");
316     int nextExpectedIndex = 0;
317     utext_setNativeIndex(tstr, 0);
318     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
319         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
320             printf("------------------------------------------------ %d\n", j);
321             ++nextExpectedIndex;
322         }
323 
324         UChar32 c = utext_next32(tstr);
325         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
326         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
327                            u_isUAlphabetic(c),
328                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
329                            u_isalnum(c),
330                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
331                                                   u_charType(c),
332                                                   U_SHORT_PROPERTY_NAME),
333                            u_getPropertyValueName(UCHAR_WORD_BREAK,
334                                                   u_getIntPropertyValue(c,
335                                                           UCHAR_WORD_BREAK),
336                                                   U_SHORT_PROPERTY_NAME),
337                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
338                                    u_getIntPropertyValue(c,
339                                            UCHAR_SENTENCE_BREAK),
340                                    U_SHORT_PROPERTY_NAME),
341                            u_getPropertyValueName(UCHAR_LINE_BREAK,
342                                    u_getIntPropertyValue(c,
343                                            UCHAR_LINE_BREAK),
344                                    U_SHORT_PROPERTY_NAME),
345                            name);
346     }
347 }
348 
349 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)350 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
351    UErrorCode status = U_ZERO_ERROR;
352    UText *tstr = NULL;
353    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
354    if (U_FAILURE(status)) {
355        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
356        return;
357     }
358    printStringBreaks(tstr, expected, expectedCount);
359    utext_close(tstr);
360 }
361 
362 
TestBug3818()363 void RBBITest::TestBug3818() {
364     UErrorCode  status = U_ZERO_ERROR;
365 
366     // Four Thai words...
367     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
368                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
369     UnicodeString  thaiStr(thaiWordData);
370 
371     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
372     if (U_FAILURE(status) || bi == NULL) {
373         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
374         return;
375     }
376     bi->setText(thaiStr);
377 
378     int32_t  startOfSecondWord = bi->following(1);
379     if (startOfSecondWord != 4) {
380         errln("Fail at file %s, line %d expected start of word at 4, got %d",
381             __FILE__, __LINE__, startOfSecondWord);
382     }
383     startOfSecondWord = bi->following(0);
384     if (startOfSecondWord != 4) {
385         errln("Fail at file %s, line %d expected start of word at 4, got %d",
386             __FILE__, __LINE__, startOfSecondWord);
387     }
388     delete bi;
389 }
390 
391 //----------------------------------------------------------------------------
392 //
393 // generalIteratorTest      Given a break iterator and a set of test data,
394 //                          Run the tests and report the results.
395 //
396 //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)397 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
398 {
399 
400     bi.setText(td.fDataToBreak);
401 
402     testFirstAndNext(bi, td);
403 
404     testLastAndPrevious(bi, td);
405 
406     testFollowing(bi, td);
407     testPreceding(bi, td);
408     testIsBoundary(bi, td);
409     doMultipleSelectionTest(bi, td);
410 }
411 
412 
413 //
414 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
415 //                       kind of loop.
416 //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)417 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
418 {
419     UErrorCode  status = U_ZERO_ERROR;
420     int32_t     p;
421     int32_t     lastP = -1;
422     int32_t     tag;
423 
424     logln("Test first and next");
425     bi.setText(td.fDataToBreak);
426     td.clearResults();
427 
428     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
429         td.fActualBreakPositions.addElement(p, status);  // Save result.
430         tag = bi.getRuleStatus();
431         td.fActualTags.addElement(tag, status);
432         if (p <= lastP) {
433             // If the iterator is not making forward progress, stop.
434             //  No need to raise an error here, it'll be detected in the normal check of results.
435             break;
436         }
437         lastP = p;
438     }
439     td.checkResults("testFirstAndNext", this);
440 }
441 
442 
443 //
444 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
445 //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)446 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
447 {
448     UErrorCode  status = U_ZERO_ERROR;
449     int32_t     p;
450     int32_t     lastP  = 0x7ffffffe;
451     int32_t     tag;
452 
453     logln("Test last and previous");
454     bi.setText(td.fDataToBreak);
455     td.clearResults();
456 
457     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
458         // Save break position.  Insert it at start of vector of results, shoving
459         //    already-saved results further towards the end.
460         td.fActualBreakPositions.insertElementAt(p, 0, status);
461         // bi.previous();   // TODO:  Why does this fix things up????
462         // bi.next();
463         tag = bi.getRuleStatus();
464         td.fActualTags.insertElementAt(tag, 0, status);
465         if (p >= lastP) {
466             // If the iterator is not making progress, stop.
467             //  No need to raise an error here, it'll be detected in the normal check of results.
468             break;
469         }
470         lastP = p;
471     }
472     td.checkResults("testLastAndPrevious", this);
473 }
474 
475 
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)476 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
477 {
478     UErrorCode  status = U_ZERO_ERROR;
479     int32_t     p;
480     int32_t     tag;
481     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
482                                  //   cannot be -1; that is returned for DONE.
483     int         i;
484 
485     logln("testFollowing():");
486     bi.setText(td.fDataToBreak);
487     td.clearResults();
488 
489     // Save the starting point, since we won't get that out of following.
490     p = bi.first();
491     td.fActualBreakPositions.addElement(p, status);  // Save result.
492     tag = bi.getRuleStatus();
493     td.fActualTags.addElement(tag, status);
494 
495     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
496         p = bi.following(i);
497         if (p != lastP) {
498             if (p == RuleBasedBreakIterator::DONE) {
499                 break;
500             }
501             // We've reached a new break position.  Save it.
502             td.fActualBreakPositions.addElement(p, status);  // Save result.
503             tag = bi.getRuleStatus();
504             td.fActualTags.addElement(tag, status);
505             lastP = p;
506         }
507     }
508     // The loop normally exits by means of the break in the middle.
509     // Make sure that the index was at the correct position for the break iterator to have
510     //   returned DONE.
511     if (i != td.fDataToBreak.length()) {
512         errln("testFollowing():  iterator returned DONE prematurely.");
513     }
514 
515     // Full check of all results.
516     td.checkResults("testFollowing", this);
517 }
518 
519 
520 
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)521 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
522     UErrorCode  status = U_ZERO_ERROR;
523     int32_t     p;
524     int32_t     tag;
525     int32_t     lastP  = 0x7ffffffe;
526     int         i;
527 
528     logln("testPreceding():");
529     bi.setText(td.fDataToBreak);
530     td.clearResults();
531 
532     p = bi.last();
533     td.fActualBreakPositions.addElement(p, status);
534     tag = bi.getRuleStatus();
535     td.fActualTags.addElement(tag, status);
536 
537     for (i = td.fDataToBreak.length(); i>=-1; i--) {
538         p = bi.preceding(i);
539         if (p != lastP) {
540             if (p == RuleBasedBreakIterator::DONE) {
541                 break;
542             }
543             // We've reached a new break position.  Save it.
544             td.fActualBreakPositions.insertElementAt(p, 0, status);
545             lastP = p;
546             tag = bi.getRuleStatus();
547             td.fActualTags.insertElementAt(tag, 0, status);
548         }
549     }
550     // The loop normally exits by means of the break in the middle.
551     // Make sure that the index was at the correct position for the break iterator to have
552     //   returned DONE.
553     if (i != 0) {
554         errln("testPreceding():  iterator returned DONE prematurely.");
555     }
556 
557     // Full check of all results.
558     td.checkResults("testPreceding", this);
559 }
560 
561 
562 
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)563 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
564     UErrorCode  status = U_ZERO_ERROR;
565     int         i;
566     int32_t     tag;
567 
568     logln("testIsBoundary():");
569     bi.setText(td.fDataToBreak);
570     td.clearResults();
571 
572     for (i = 0; i <= td.fDataToBreak.length(); i++) {
573         if (bi.isBoundary(i)) {
574             td.fActualBreakPositions.addElement(i, status);  // Save result.
575             tag = bi.getRuleStatus();
576             td.fActualTags.addElement(tag, status);
577         }
578     }
579     td.checkResults("testIsBoundary: ", this);
580 }
581 
582 
583 
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)584 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
585 {
586     iterator.setText(td.fDataToBreak);
587 
588     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
589     int32_t offset = iterator.first();
590     int32_t testOffset;
591     int32_t count = 0;
592 
593     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
594 
595     if (*testIterator != iterator)
596         errln("clone() or operator!= failed: two clones compared unequal");
597 
598     do {
599         testOffset = testIterator->first();
600         testOffset = testIterator->next(count);
601         if (offset != testOffset)
602             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
603 
604         if (offset != RuleBasedBreakIterator::DONE) {
605             count++;
606             offset = iterator.next();
607 
608             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
609                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
610                 if (count > 10000 || offset == -1) {
611                     errln("operator== failed too many times. Stopping test.");
612                     if (offset == -1) {
613                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
614                     }
615                     return;
616                 }
617             }
618         }
619     } while (offset != RuleBasedBreakIterator::DONE);
620 
621     // now do it backwards...
622     offset = iterator.last();
623     count = 0;
624 
625     do {
626         testOffset = testIterator->last();
627         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
628         if (offset != testOffset)
629             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
630 
631         if (offset != RuleBasedBreakIterator::DONE) {
632             count--;
633             offset = iterator.previous();
634         }
635     } while (offset != RuleBasedBreakIterator::DONE);
636 
637     delete testIterator;
638 }
639 
640 
641 //---------------------------------------------
642 //
643 //     other tests
644 //
645 //---------------------------------------------
TestEmptyString()646 void RBBITest::TestEmptyString()
647 {
648     UnicodeString text = "";
649     UErrorCode status = U_ZERO_ERROR;
650 
651     BITestData x(status);
652     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
653     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
654     if (U_FAILURE(status))
655     {
656         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
657         return;
658     }
659     generalIteratorTest(*bi, x);
660     delete bi;
661 }
662 
TestGetAvailableLocales()663 void RBBITest::TestGetAvailableLocales()
664 {
665     int32_t locCount = 0;
666     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
667 
668     if (locCount == 0)
669         dataerrln("getAvailableLocales() returned an empty list!");
670     // Just make sure that it's returning good memory.
671     int32_t i;
672     for (i = 0; i < locCount; ++i) {
673         logln(locList[i].getName());
674     }
675 }
676 
677 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()678 void RBBITest::TestGetDisplayName()
679 {
680     UnicodeString   result;
681 
682     BreakIterator::getDisplayName(Locale::getUS(), result);
683     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
684         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
685                 + result);
686 
687     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
688     if (result != "French (France)")
689         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
690                 + result);
691 }
692 /**
693  * Test End Behaviour
694  * @bug 4068137
695  */
TestEndBehaviour()696 void RBBITest::TestEndBehaviour()
697 {
698     UErrorCode status = U_ZERO_ERROR;
699     UnicodeString testString("boo.");
700     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
701     if (U_FAILURE(status))
702     {
703         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
704         return;
705     }
706     wb->setText(testString);
707 
708     if (wb->first() != 0)
709         errln("Didn't get break at beginning of string.");
710     if (wb->next() != 3)
711         errln("Didn't get break before period in \"boo.\"");
712     if (wb->current() != 4 && wb->next() != 4)
713         errln("Didn't get break at end of string.");
714     delete wb;
715 }
716 /*
717  * @bug 4153072
718  */
TestBug4153072()719 void RBBITest::TestBug4153072() {
720     UErrorCode status = U_ZERO_ERROR;
721     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
722     if (U_FAILURE(status))
723     {
724         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
725         return;
726     }
727     UnicodeString str("...Hello, World!...");
728     int32_t begin = 3;
729     int32_t end = str.length() - 3;
730     UBool onBoundary;
731 
732     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
733     iter->adoptText(textIterator);
734     int index;
735     // Note: with the switch to UText, there is no way to restrict the
736     //       iteration range to begin at an index other than zero.
737     //       String character iterators created with a non-zero bound are
738     //         treated by RBBI as being empty.
739     for (index = -1; index < begin + 1; ++index) {
740         onBoundary = iter->isBoundary(index);
741         if (index == 0?  !onBoundary : onBoundary) {
742             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
743                             " and begin index = " + begin);
744         }
745     }
746     delete iter;
747 }
748 
749 
750 //
751 // Test for problem reported by Ashok Matoria on 9 July 2007
752 //    One.<kSoftHyphen><kSpace>Two.
753 //
754 //    Sentence break at start (0) and then on calling next() it breaks at
755 //   'T' of "Two". Now, at this point if I do next() and
//    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
757 //
TestBug5775()758 void RBBITest::TestBug5775() {
759     UErrorCode status = U_ZERO_ERROR;
760     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
761     TEST_ASSERT_SUCCESS(status);
762     if (U_FAILURE(status)) {
763         return;
764     }
765 // Check for status first for better handling of no data errors.
766     TEST_ASSERT(bi != NULL);
767     if (bi == NULL) {
768         return;
769     }
770 
771     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
772     //               01234      56789
773     s = s.unescape();
774     bi->setText(s);
775     int pos = bi->next();
776     TEST_ASSERT(pos == 6);
777     pos = bi->next();
778     TEST_ASSERT(pos == 10);
779     pos = bi->previous();
780     TEST_ASSERT(pos == 6);
781     delete bi;
782 }
783 
784 
785 
786 //------------------------------------------------------------------------------
787 //
788 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
789 //
790 //------------------------------------------------------------------------------
791 
792 struct TestParams {
793     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
794                                            //   Changed out whenever test data changes break type.
795 
796     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
797     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
798     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
799     UVector32       *srcCol;
800 
801     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
802     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
803     CharString       utf8String;           // UTF-8 form of text to break.
804 
TestParamsTestParams805     TestParams(UErrorCode &status) : dataToBreak() {
806         bi               = NULL;
807         expectedBreaks   = new UVector32(status);
808         srcLine          = new UVector32(status);
809         srcCol           = new UVector32(status);
810         textToBreak      = NULL;
811         textMap          = new UVector32(status);
812     }
813 
~TestParamsTestParams814     ~TestParams() {
815         delete bi;
816         delete expectedBreaks;
817         delete srcLine;
818         delete srcCol;
819         utext_close(textToBreak);
820         delete textMap;
821     }
822 
823     int32_t getSrcLine(int32_t bp);
824     int32_t getExpectedBreak(int32_t bp);
825     int32_t getSrcCol(int32_t bp);
826 
827     void setUTF16(UErrorCode &status);
828     void setUTF8(UErrorCode &status);
829 };
830 
831 // Append a UnicodeString to a CharString with UTF-8 encoding.
832 // Substitute any invalid chars.
833 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)834 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
835     if (U_FAILURE(status)) {
836         return;
837     }
838     int32_t utf8Length;
839     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
840                        src.getBuffer(), src.length(),   // UTF-16 data
841                        0xfffd, NULL,                    // Substitution char, number of subs.
842                        &status);
843     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
844         return;
845     }
846     status = U_ZERO_ERROR;
847     int32_t capacity;
848     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
849     u_strToUTF8WithSub(buffer, utf8Length, NULL,
850                        src.getBuffer(), src.length(),
851                        0xfffd, NULL, &status);
852     dest.append(buffer, utf8Length, status);
853 }
854 
855 
setUTF16(UErrorCode & status)856 void TestParams::setUTF16(UErrorCode &status) {
857     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
858     textMap->removeAllElements();
859     for (int32_t i=0; i<dataToBreak.length(); i++) {
860         if (i == dataToBreak.getChar32Start(i)) {
861             textMap->addElement(i, status);
862         } else {
863             textMap->addElement(-1, status);
864         }
865     }
866     textMap->addElement(dataToBreak.length(), status);
867     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
868 }
869 
870 
setUTF8(UErrorCode & status)871 void TestParams::setUTF8(UErrorCode &status) {
872     if (U_FAILURE(status)) {
873         return;
874     }
875     utf8String.clear();
876     CharStringAppend(utf8String, dataToBreak, status);
877     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
878     if (U_FAILURE(status)) {
879         return;
880     }
881 
882     textMap->removeAllElements();
883     int32_t utf16Index = 0;
884     for (;;) {
885         textMap->addElement(utf16Index, status);
886         UChar32 c32 = utext_current32(textToBreak);
887         if (c32 < 0) {
888             break;
889         }
890         utf16Index += U16_LENGTH(c32);
891         utext_next32(textToBreak);
892         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
893             textMap->addElement(-1, status);
894         }
895     }
896     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
897 }
898 
899 
getSrcLine(int32_t bp)900 int32_t TestParams::getSrcLine(int32_t bp) {
901     if (bp >= textMap->size()) {
902         bp = textMap->size() - 1;
903     }
904     int32_t i = 0;
905     for(; bp >= 0 ; --bp) {
906         // Move to a character boundary if we are not on one already.
907         i = textMap->elementAti(bp);
908         if (i >= 0) {
909             break;
910         }
911     }
912     return srcLine->elementAti(i);
913 }
914 
915 
getExpectedBreak(int32_t bp)916 int32_t TestParams::getExpectedBreak(int32_t bp) {
917     if (bp >= textMap->size()) {
918         return 0;
919     }
920     int32_t i = textMap->elementAti(bp);
921     int32_t retVal = 0;
922     if (i >= 0) {
923         retVal = expectedBreaks->elementAti(i);
924     }
925     return retVal;
926 }
927 
928 
getSrcCol(int32_t bp)929 int32_t TestParams::getSrcCol(int32_t bp) {
930     if (bp >= textMap->size()) {
931         bp = textMap->size() - 1;
932     }
933     int32_t i = 0;
934     for(; bp >= 0; --bp) {
935         // Move bp to a character boundary if we are not on one already.
936         i = textMap->elementAti(bp);
937         if (i >= 0) {
938             break;
939         }
940     }
941     return srcCol->elementAti(i);
942 }
943 
944 
executeTest(TestParams * t,UErrorCode & status)945 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
946     int32_t    bp;
947     int32_t    prevBP;
948     int32_t    i;
949 
950     TEST_ASSERT_SUCCESS(status);
951     if (U_FAILURE(status)) {
952         return;
953     }
954 
955     if (t->bi == NULL) {
956         return;
957     }
958 
959     t->bi->setText(t->textToBreak, status);
960     //
961     //  Run the iterator forward
962     //
963     prevBP = -1;
964     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
965         if (prevBP ==  bp) {
966             // Fail for lack of forward progress.
967             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
968                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
969             break;
970         }
971 
972         // Check that there we didn't miss an expected break between the last one
973         //  and this one.
974         for (i=prevBP+1; i<bp; i++) {
975             if (t->getExpectedBreak(i) != 0) {
976                 int expected[] = {0, i};
977                 printStringBreaks(t->dataToBreak, expected, 2);
978                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
979                       i, t->getSrcLine(i), t->getSrcCol(i));
980             }
981         }
982 
983         // Check that the break we did find was expected
984         if (t->getExpectedBreak(bp) == 0) {
985             int expected[] = {0, bp};
986             printStringBreaks(t->textToBreak, expected, 2);
987             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
988                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
989         } else {
990             // The break was expected.
991             //   Check that the {nnn} tag value is correct.
992             int32_t expectedTagVal = t->getExpectedBreak(bp);
993             if (expectedTagVal == -1) {
994                 expectedTagVal = 0;
995             }
996             int32_t line = t->getSrcLine(bp);
997             int32_t rs = t->bi->getRuleStatus();
998             if (rs != expectedTagVal) {
999                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1000                       "          Actual, Expected status = %4d, %4d",
1001                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1002             }
1003         }
1004 
1005         prevBP = bp;
1006     }
1007 
1008     // Verify that there were no missed expected breaks after the last one found
1009     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1010         if (t->getExpectedBreak(i) != 0) {
1011             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1012                       i, t->getSrcLine(i), t->getSrcCol(i));
1013         }
1014     }
1015 
1016     //
1017     //  Run the iterator backwards, verify that the same breaks are found.
1018     //
1019     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1020     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1021         if (prevBP ==  bp) {
1022             // Fail for lack of progress.
1023             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1024                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1025             break;
1026         }
1027 
1028         // Check that we didn't miss an expected break between the last one
1029         //  and this one.  (UVector returns zeros for index out of bounds.)
1030         for (i=prevBP-1; i>bp; i--) {
1031             if (t->getExpectedBreak(i) != 0) {
1032                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1033                       i, t->getSrcLine(i), t->getSrcCol(i));
1034             }
1035         }
1036 
1037         // Check that the break we did find was expected
1038         if (t->getExpectedBreak(bp) == 0) {
1039             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1040                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
1041         } else {
1042             // The break was expected.
1043             //   Check that the {nnn} tag value is correct.
1044             int32_t expectedTagVal = t->getExpectedBreak(bp);
1045             if (expectedTagVal == -1) {
1046                 expectedTagVal = 0;
1047             }
1048             int line = t->getSrcLine(bp);
1049             int32_t rs = t->bi->getRuleStatus();
1050             if (rs != expectedTagVal) {
1051                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1052                       "          Actual, Expected status = %4d, %4d",
1053                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1054             }
1055         }
1056 
1057         prevBP = bp;
1058     }
1059 
1060     // Verify that there were no missed breaks prior to the last one found
1061     for (i=prevBP-1; i>=0; i--) {
1062         if (t->getExpectedBreak(i) != 0) {
1063             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1064                       i, t->getSrcLine(i), t->getSrcCol(i));
1065         }
1066     }
1067 
1068     // Check isBoundary()
1069     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1070         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1071         UBool boundaryFound    = t->bi->isBoundary(i);
1072         if (boundaryExpected != boundaryFound) {
1073             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1074                   "        Expected, Actual= %s, %s",
1075                   i, t->getSrcLine(i), t->getSrcCol(i),
1076                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1077         }
1078     }
1079 
1080     // Check following()
1081     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1082         int32_t actualBreak = t->bi->following(i);
1083         int32_t expectedBreak = BreakIterator::DONE;
1084         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1085             if (t->getExpectedBreak(j) != 0) {
1086                 expectedBreak = j;
1087                 break;
1088             }
1089         }
1090         if (expectedBreak != actualBreak) {
1091             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1092                   "        Expected, Actual= %d, %d",
1093                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1094         }
1095     }
1096 
1097     // Check preceding()
1098     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1099         int32_t actualBreak = t->bi->preceding(i);
1100         int32_t expectedBreak = BreakIterator::DONE;
1101 
1102         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1103         // preceding(trailing byte) will return the index of some preceding code point,
1104         // not the lead byte of the current code point, even though that has a smaller index.
1105         // Therefore, start looking at the expected break data not at i-1, but at
1106         // the start of code point index - 1.
1107         utext_setNativeIndex(t->textToBreak, i);
1108         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1109         for (; j >= 0; j--) {
1110             if (t->getExpectedBreak(j) != 0) {
1111                 expectedBreak = j;
1112                 break;
1113             }
1114         }
1115         if (expectedBreak != actualBreak) {
1116             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1117                   "        Expected, Actual= %d, %d",
1118                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1119         }
1120     }
1121 }
1122 
1123 
TestExtended()1124 void RBBITest::TestExtended() {
1125 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1126     UErrorCode      status  = U_ZERO_ERROR;
1127     Locale          locale("");
1128 
1129     UnicodeString       rules;
1130     TestParams          tp(status);
1131 
1132     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1133     if (U_FAILURE(status)) {
1134         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1135     }
1136 
1137 
1138     //
1139     //  Open and read the test data file.
1140     //
1141     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1142     char testFileName[1000];
1143     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1144         errln("Can't open test data.  Path too long.");
1145         return;
1146     }
1147     strcpy(testFileName, testDataDirectory);
1148     strcat(testFileName, "rbbitst.txt");
1149 
1150     int    len;
1151     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1152     if (U_FAILURE(status)) {
1153         return; /* something went wrong, error already output */
1154     }
1155 
1156 
1157     bool skipTest = false; // Skip this test?
1158 
1159     //
1160     //  Put the test data into a UnicodeString
1161     //
1162     UnicodeString testString(FALSE, testFile, len);
1163 
1164     enum EParseState{
1165         PARSE_COMMENT,
1166         PARSE_TAG,
1167         PARSE_DATA,
1168         PARSE_NUM
1169     }
1170     parseState = PARSE_TAG;
1171 
1172     EParseState savedState = PARSE_TAG;
1173 
1174     static const UChar CH_LF        = 0x0a;
1175     static const UChar CH_CR        = 0x0d;
1176     static const UChar CH_HASH      = 0x23;
1177     /*static const UChar CH_PERIOD    = 0x2e;*/
1178     static const UChar CH_LT        = 0x3c;
1179     static const UChar CH_GT        = 0x3e;
1180     static const UChar CH_BACKSLASH = 0x5c;
1181     static const UChar CH_BULLET    = 0x2022;
1182 
1183     int32_t    lineNum  = 1;
1184     int32_t    colStart = 0;
1185     int32_t    column   = 0;
1186     int32_t    charIdx  = 0;
1187 
1188     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1189 
1190     for (charIdx = 0; charIdx < len; ) {
1191         status = U_ZERO_ERROR;
1192         UChar  c = testString.charAt(charIdx);
1193         charIdx++;
1194         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1195             // treat CRLF as a unit
1196             c = CH_LF;
1197             charIdx++;
1198         }
1199         if (c == CH_LF || c == CH_CR) {
1200             lineNum++;
1201             colStart = charIdx;
1202         }
1203         column = charIdx - colStart + 1;
1204 
1205         switch (parseState) {
1206         case PARSE_COMMENT:
1207             if (c == 0x0a || c == 0x0d) {
1208                 parseState = savedState;
1209             }
1210             break;
1211 
1212         case PARSE_TAG:
1213             {
1214             if (c == CH_HASH) {
1215                 parseState = PARSE_COMMENT;
1216                 savedState = PARSE_TAG;
1217                 break;
1218             }
1219             if (u_isUWhiteSpace(c)) {
1220                 break;
1221             }
1222             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1223                 delete tp.bi;
1224                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1225                 skipTest = false;
1226                 charIdx += 5;
1227                 break;
1228             }
1229             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1230                 delete tp.bi;
1231                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1232                 skipTest = false;
1233                 charIdx += 5;
1234                 break;
1235             }
1236             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1237                 delete tp.bi;
1238                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1239                 skipTest = false;
1240                 charIdx += 5;
1241                 break;
1242             }
1243             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1244                 delete tp.bi;
1245                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1246                 skipTest = false;
1247                 charIdx += 5;
1248                 break;
1249             }
1250             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1251                 delete tp.bi;
1252                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1253                 charIdx += 6;
1254                 break;
1255             }
1256 
1257             // <locale  loc_name>
1258             localeMatcher.reset(testString);
1259             if (localeMatcher.lookingAt(charIdx-1, status)) {
1260                 UnicodeString localeName = localeMatcher.group(1, status);
1261                 char localeName8[100];
1262                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1263                 locale = Locale::createFromName(localeName8);
1264                 charIdx += localeMatcher.group(0, status).length() - 1;
1265                 TEST_ASSERT_SUCCESS(status);
1266                 break;
1267             }
1268             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1269                 parseState = PARSE_DATA;
1270                 charIdx += 5;
1271                 tp.dataToBreak = "";
1272                 tp.expectedBreaks->removeAllElements();
1273                 tp.srcCol ->removeAllElements();
1274                 tp.srcLine->removeAllElements();
1275                 break;
1276             }
1277 
1278             errln("line %d: Tag expected in test file.", lineNum);
1279             parseState = PARSE_COMMENT;
1280             savedState = PARSE_DATA;
1281             goto end_test; // Stop the test.
1282             }
1283             break;
1284 
1285         case PARSE_DATA:
1286             if (c == CH_BULLET) {
1287                 int32_t  breakIdx = tp.dataToBreak.length();
1288                 tp.expectedBreaks->setSize(breakIdx+1);
1289                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1290                 tp.srcLine->setSize(breakIdx+1);
1291                 tp.srcLine->setElementAt(lineNum, breakIdx);
1292                 tp.srcCol ->setSize(breakIdx+1);
1293                 tp.srcCol ->setElementAt(column, breakIdx);
1294                 break;
1295             }
1296 
1297             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1298                 // Add final entry to mappings from break location to source file position.
1299                 //  Need one extra because last break position returned is after the
1300                 //    last char in the data, not at the last char.
1301                 tp.srcLine->addElement(lineNum, status);
1302                 tp.srcCol ->addElement(column, status);
1303 
1304                 parseState = PARSE_TAG;
1305                 charIdx += 6;
1306 
1307                 if (!skipTest) {
1308                     // RUN THE TEST!
1309                     status = U_ZERO_ERROR;
1310                     tp.setUTF16(status);
1311                     executeTest(&tp, status);
1312                     TEST_ASSERT_SUCCESS(status);
1313 
1314                     // Run again, this time with UTF-8 text wrapped in a UText.
1315                     status = U_ZERO_ERROR;
1316                     tp.setUTF8(status);
1317                     TEST_ASSERT_SUCCESS(status);
1318                     executeTest(&tp, status);
1319                 }
1320                 break;
1321             }
1322 
1323             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1324                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1325                 // Get the code point from the name and insert it into the test data.
1326                 //   (Damn, no API takes names in Unicode  !!!
1327                 //    we've got to take it back to char *)
1328                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1329                 int32_t nameLength = nameEndIdx - (charIdx+2);
1330                 char charNameBuf[200];
1331                 UChar32 theChar = -1;
1332                 if (nameEndIdx != -1) {
1333                     UErrorCode status = U_ZERO_ERROR;
1334                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1335                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1336                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1337                     if (U_FAILURE(status)) {
1338                         theChar = -1;
1339                     }
1340                 }
1341                 if (theChar == -1) {
1342                     errln("Error in named character in test file at line %d, col %d",
1343                         lineNum, column);
1344                 } else {
1345                     // Named code point was recognized.  Insert it
1346                     //   into the test data.
1347                     tp.dataToBreak.append(theChar);
1348                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1349                         tp.srcLine->addElement(lineNum, status);
1350                         tp.srcCol ->addElement(column, status);
1351                     }
1352                 }
1353                 if (nameEndIdx > charIdx) {
1354                     charIdx = nameEndIdx+1;
1355 
1356                 }
1357                 break;
1358             }
1359 
1360 
1361 
1362 
1363             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1364                 charIdx++;
1365                 int32_t  breakIdx = tp.dataToBreak.length();
1366                 tp.expectedBreaks->setSize(breakIdx+1);
1367                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1368                 tp.srcLine->setSize(breakIdx+1);
1369                 tp.srcLine->setElementAt(lineNum, breakIdx);
1370                 tp.srcCol ->setSize(breakIdx+1);
1371                 tp.srcCol ->setElementAt(column, breakIdx);
1372                 break;
1373             }
1374 
1375             if (c == CH_LT) {
1376                 tagValue   = 0;
1377                 parseState = PARSE_NUM;
1378                 break;
1379             }
1380 
1381             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1382                 parseState = PARSE_COMMENT;
1383                 savedState = PARSE_DATA;
1384                 break;
1385             }
1386 
1387             if (c == CH_BACKSLASH) {
1388                 // Check for \ at end of line, a line continuation.
1389                 //     Advance over (discard) the newline
1390                 UChar32 cp = testString.char32At(charIdx);
1391                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1392                     // We have a CR LF
1393                     //  Need an extra increment of the input ptr to move over both of them
1394                     charIdx++;
1395                 }
1396                 if (cp == CH_LF || cp == CH_CR) {
1397                     lineNum++;
1398                     colStart = charIdx;
1399                     charIdx++;
1400                     break;
1401                 }
1402 
1403                 // Let unescape handle the back slash.
1404                 cp = testString.unescapeAt(charIdx);
1405                 if (cp != -1) {
1406                     // Escape sequence was recognized.  Insert the char
1407                     //   into the test data.
1408                     tp.dataToBreak.append(cp);
1409                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1410                         tp.srcLine->addElement(lineNum, status);
1411                         tp.srcCol ->addElement(column, status);
1412                     }
1413                     break;
1414                 }
1415 
1416 
1417                 // Not a recognized backslash escape sequence.
1418                 // Take the next char as a literal.
1419                 //  TODO:  Should this be an error?
1420                 c = testString.charAt(charIdx);
1421                 charIdx = testString.moveIndex32(charIdx, 1);
1422             }
1423 
1424             // Normal, non-escaped data char.
1425             tp.dataToBreak.append(c);
1426 
1427             // Save the mapping from offset in the data to line/column numbers in
1428             //   the original input file.  Will be used for better error messages only.
1429             //   If there's an expected break before this char, the slot in the mapping
1430             //     vector will already be set for this char; don't overwrite it.
1431             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1432                 tp.srcLine->addElement(lineNum, status);
1433                 tp.srcCol ->addElement(column, status);
1434             }
1435             break;
1436 
1437 
1438         case PARSE_NUM:
1439             // We are parsing an expected numeric tag value, like <1234>,
1440             //   within a chunk of data.
1441             if (u_isUWhiteSpace(c)) {
1442                 break;
1443             }
1444 
1445             if (c == CH_GT) {
1446                 // Finished the number.  Add the info to the expected break data,
1447                 //   and switch parse state back to doing plain data.
1448                 parseState = PARSE_DATA;
1449                 if (tagValue == 0) {
1450                     tagValue = -1;
1451                 }
1452                 int32_t  breakIdx = tp.dataToBreak.length();
1453                 tp.expectedBreaks->setSize(breakIdx+1);
1454                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1455                 tp.srcLine->setSize(breakIdx+1);
1456                 tp.srcLine->setElementAt(lineNum, breakIdx);
1457                 tp.srcCol ->setSize(breakIdx+1);
1458                 tp.srcCol ->setElementAt(column, breakIdx);
1459                 break;
1460             }
1461 
1462             if (u_isdigit(c)) {
1463                 tagValue = tagValue*10 + u_charDigitValue(c);
1464                 break;
1465             }
1466 
1467             errln("Syntax Error in test file at line %d, col %d",
1468                 lineNum, column);
1469             parseState = PARSE_COMMENT;
1470             goto end_test; // Stop the test
1471             break;
1472         }
1473 
1474 
1475         if (U_FAILURE(status)) {
1476             dataerrln("ICU Error %s while parsing test file at line %d.",
1477                 u_errorName(status), lineNum);
1478             status = U_ZERO_ERROR;
1479             goto end_test; // Stop the test
1480         }
1481 
1482     }
1483 
1484 end_test:
1485     delete [] testFile;
1486 #endif
1487 }
1488 
1489 
1490 //-------------------------------------------------------------------------------
1491 //
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  and the dictionary code, without a type, would loop.
1496 //
1497 //-------------------------------------------------------------------------------
TestDictRules()1498 void RBBITest::TestDictRules() {
1499     const char *rules =  "$dictionary = [a-z]; \n"
1500                          "!!forward; \n"
1501                          "$dictionary $dictionary; \n"
1502                          "!!reverse; \n"
1503                          "$dictionary $dictionary; \n";
1504     const char *text = "aa";
1505     UErrorCode status = U_ZERO_ERROR;
1506     UParseError parseError;
1507 
1508     RuleBasedBreakIterator bi(rules, parseError, status);
1509     if (U_SUCCESS(status)) {
1510         UnicodeString utext = text;
1511         bi.setText(utext);
1512         int32_t position;
1513         int32_t loops;
1514         for (loops = 0; loops<10; loops++) {
1515             position = bi.next();
1516             if (position == RuleBasedBreakIterator::DONE) {
1517                 break;
1518             }
1519         }
1520         TEST_ASSERT(loops == 1);
1521     } else {
1522         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1523     }
1524 }
1525 
1526 
1527 
1528 //-------------------------------------------------------------------------------
1529 //
1530 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1531 //    return the data in one big UChar * buffer, which the caller must delete.
1532 //
1533 //    parameters:
1534 //          fileName:   the name of the file, with no directory part.  The test data directory
1535 //                      is assumed.
1536 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1537 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1538 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1539 //                      Pass NULL for the system default encoding.
1540 //          status
1541 //    returns:
1542 //                      The file data, converted to UChar.
1543 //                      The caller must delete this when done with
1544 //                           delete [] theBuffer;
1545 //
1546 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1547 //           Move this function to some common place.
1548 //
1549 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1550 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1551     UChar       *retPtr  = NULL;
1552     char        *fileBuf = NULL;
1553     UConverter* conv     = NULL;
1554     FILE        *f       = NULL;
1555 
1556     ulen = 0;
1557     if (U_FAILURE(status)) {
1558         return retPtr;
1559     }
1560 
1561     //
1562     //  Open the file.
1563     //
1564     f = fopen(fileName, "rb");
1565     if (f == 0) {
1566         dataerrln("Error opening test data file %s\n", fileName);
1567         status = U_FILE_ACCESS_ERROR;
1568         return NULL;
1569     }
1570     //
1571     //  Read it in
1572     //
1573     int   fileSize;
1574     int   amt_read;
1575 
1576     fseek( f, 0, SEEK_END);
1577     fileSize = ftell(f);
1578     fileBuf = new char[fileSize];
1579     fseek(f, 0, SEEK_SET);
1580     amt_read = fread(fileBuf, 1, fileSize, f);
1581     if (amt_read != fileSize || fileSize <= 0) {
1582         errln("Error reading test data file.");
1583         goto cleanUpAndReturn;
1584     }
1585 
1586     //
1587     // Look for a Unicode Signature (BOM) on the data just read
1588     //
1589     int32_t        signatureLength;
1590     const char *   fileBufC;
1591     const char*    bomEncoding;
1592 
1593     fileBufC = fileBuf;
1594     bomEncoding = ucnv_detectUnicodeSignature(
1595         fileBuf, fileSize, &signatureLength, &status);
1596     if(bomEncoding!=NULL ){
1597         fileBufC  += signatureLength;
1598         fileSize  -= signatureLength;
1599         encoding = bomEncoding;
1600     }
1601 
1602     //
1603     // Open a converter to take the rule file to UTF-16
1604     //
1605     conv = ucnv_open(encoding, &status);
1606     if (U_FAILURE(status)) {
1607         goto cleanUpAndReturn;
1608     }
1609 
1610     //
1611     // Convert the rules to UChar.
1612     //  Preflight first to determine required buffer size.
1613     //
1614     ulen = ucnv_toUChars(conv,
1615         NULL,           //  dest,
1616         0,              //  destCapacity,
1617         fileBufC,
1618         fileSize,
1619         &status);
1620     if (status == U_BUFFER_OVERFLOW_ERROR) {
1621         // Buffer Overflow is expected from the preflight operation.
1622         status = U_ZERO_ERROR;
1623 
1624         retPtr = new UChar[ulen+1];
1625         ucnv_toUChars(conv,
1626             retPtr,       //  dest,
1627             ulen+1,
1628             fileBufC,
1629             fileSize,
1630             &status);
1631     }
1632 
1633 cleanUpAndReturn:
1634     fclose(f);
1635     delete []fileBuf;
1636     ucnv_close(conv);
1637     if (U_FAILURE(status)) {
1638         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1639         delete []retPtr;
1640         retPtr = 0;
1641         ulen   = 0;
1642     };
1643     return retPtr;
1644 }
1645 
1646 
1647 
1648 //--------------------------------------------------------------------------------------------
1649 //
1650 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1651 //
1652 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1653 void RBBITest::TestUnicodeFiles() {
1654     RuleBasedBreakIterator  *bi;
1655     UErrorCode               status = U_ZERO_ERROR;
1656 
1657     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1658     TEST_ASSERT_SUCCESS(status);
1659     if (U_SUCCESS(status)) {
1660         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1661     }
1662     delete bi;
1663 
1664     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1665     TEST_ASSERT_SUCCESS(status);
1666     if (U_SUCCESS(status)) {
1667         runUnicodeTestData("WordBreakTest.txt", bi);
1668     }
1669     delete bi;
1670 
1671     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1672     TEST_ASSERT_SUCCESS(status);
1673     if (U_SUCCESS(status)) {
1674         runUnicodeTestData("SentenceBreakTest.txt", bi);
1675     }
1676     delete bi;
1677 
1678     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1679     TEST_ASSERT_SUCCESS(status);
1680     if (U_SUCCESS(status)) {
1681         runUnicodeTestData("LineBreakTest.txt", bi);
1682     }
1683     delete bi;
1684 }
1685 
1686 
1687 // Check for test cases from the Unicode test data files that are known to fail
1688 // and should be skipped because ICU is not yet able to fully implement the spec.
1689 // See ticket #7270.
1690 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1691 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1692     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1693         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1694         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1695         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1696         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1697         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1698         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1699     };
1700     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1701         return FALSE;
1702     }
1703 
1704     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1705         if (testCase == UnicodeString(badTestCases[i])) {
1706             return logKnownIssue("7270");
1707         }
1708     }
1709     return FALSE;
1710 }
1711 
1712 
1713 //--------------------------------------------------------------------------------------------
1714 //
1715 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1716 //
1717 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1718 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1719 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1720     UErrorCode  status = U_ZERO_ERROR;
1721 
1722     //
1723     //  Open and read the test data file, put it into a UnicodeString.
1724     //
1725     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1726     char testFileName[1000];
1727     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1728         dataerrln("Can't open test data.  Path too long.");
1729         return;
1730     }
1731     strcpy(testFileName, testDataDirectory);
1732     strcat(testFileName, fileName);
1733 
1734     logln("Opening data file %s\n", fileName);
1735 
1736     int    len;
1737     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1738     if (status != U_FILE_ACCESS_ERROR) {
1739         TEST_ASSERT_SUCCESS(status);
1740         TEST_ASSERT(testFile != NULL);
1741     }
1742     if (U_FAILURE(status) || testFile == NULL) {
1743         return; /* something went wrong, error already output */
1744     }
1745     UnicodeString testFileAsString(TRUE, testFile, len);
1746 
1747     //
1748     //  Parse the test data file using a regular expression.
1749     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1750     //     is identified by which group had a match.
1751     //
1752     //    Caputure Group #                  1          2            3            4           5
1753     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1754     //
1755     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1756     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1757     UnicodeString   testString;
1758     UVector32       breakPositions(status);
1759     int             lineNumber = 1;
1760     TEST_ASSERT_SUCCESS(status);
1761     if (U_FAILURE(status)) {
1762         return;
1763     }
1764 
1765     //
1766     //  Scan through each test case, building up the string to be broken in testString,
1767     //   and the positions that should be boundaries in the breakPositions vector.
1768     //
1769     int spin = 0;
1770     while (tokenMatcher.find()) {
1771       	if(tokenMatcher.hitEnd()) {
1772           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1773              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1774              and caused an infinite loop here on EBCDIC systems!
1775           */
1776           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1777           //	   return;
1778       	}
1779         if (tokenMatcher.start(1, status) >= 0) {
1780             // Scanned a divide sign, indicating a break position in the test data.
1781             if (testString.length()>0) {
1782                 breakPositions.addElement(testString.length(), status);
1783             }
1784         }
1785         else if (tokenMatcher.start(2, status) >= 0) {
1786             // Scanned an 'x', meaning no break at this position in the test data
1787             //   Nothing to be done here.
1788             }
1789         else if (tokenMatcher.start(3, status) >= 0) {
1790             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1791             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1792             int length = hexNumber.length();
1793             if (length<=8) {
1794                 char buf[10];
1795                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1796                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1797                 if (c<=0x10ffff) {
1798                     testString.append(c);
1799                 } else {
1800                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1801                        fileName, lineNumber);
1802                 }
1803             } else {
1804                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1805                        fileName, lineNumber);
1806              }
1807         }
1808         else if (tokenMatcher.start(4, status) >= 0) {
1809             // Scanned to end of a line, possibly skipping over a comment in the process.
1810             //   If the line from the file contained test data, run the test now.
1811             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1812                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1813             }
1814 
1815             // Clear out this test case.
1816             //    The string and breakPositions vector will be refilled as the next
1817             //       test case is parsed.
1818             testString.remove();
1819             breakPositions.removeAllElements();
1820             lineNumber++;
1821         } else {
1822             // Scanner catchall.  Something unrecognized appeared on the line.
1823             char token[16];
1824             UnicodeString uToken = tokenMatcher.group(0, status);
1825             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1826             token[sizeof(token)-1] = 0;
1827             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1828 
1829             // Clean up, in preparation for continuing with the next line.
1830             testString.remove();
1831             breakPositions.removeAllElements();
1832             lineNumber++;
1833         }
1834         TEST_ASSERT_SUCCESS(status);
1835         if (U_FAILURE(status)) {
1836             break;
1837         }
1838     }
1839 
1840     delete [] testFile;
1841  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1842 }
1843 
1844 //--------------------------------------------------------------------------------------------
1845 //
1846 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1847 //                            test data files.  Do only a simple, forward-only check -
1848 //                            this test is mostly to check that ICU and the Unicode
1849 //                            data agree with each other.
1850 //
1851 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1852 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1853                          const UnicodeString &testString,   // Text data to be broken
1854                          UVector32 *breakPositions,         // Positions where breaks should be found.
1855                          RuleBasedBreakIterator *bi) {
1856     int32_t pos;                 // Break Position in the test string
1857     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1858     int32_t expectedPos;         // Expected break position (index into test string)
1859 
1860     bi->setText(testString);
1861     pos = bi->first();
1862     pos = bi->next();
1863 
1864     while (pos != BreakIterator::DONE) {
1865         if (expectedI >= breakPositions->size()) {
1866             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1867                 testFileName, lineNumber, pos);
1868             break;
1869         }
1870         expectedPos = breakPositions->elementAti(expectedI);
1871         if (pos < expectedPos) {
1872             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1873                 testFileName, lineNumber, pos);
1874             break;
1875         }
1876         if (pos > expectedPos) {
1877             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1878                 testFileName, lineNumber, expectedPos);
1879             break;
1880         }
1881         pos = bi->next();
1882         expectedI++;
1883     }
1884 
1885     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1886         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1887             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1888     }
1889 }
1890 
1891 
1892 
1893 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1894 //---------------------------------------------------------------------------------------
1895 //
//   class RBBIMonkeyKind
1897 //
1898 //      Monkey Test for Break Iteration
1899 //      Abstract interface class.   Concrete derived classes independently
1900 //      implement the break rules for different iterator types.
1901 //
//      The Monkey Test itself doesn't know which type of break iterator it is
1903 //      testing, but works purely in terms of the interface defined here.
1904 //
1905 //---------------------------------------------------------------------------------------
class RBBIMonkeyKind {
public:
    // Return a UVector of UnicodeSets, representing the character classes used
    //   for this type of iterator.
    virtual  UVector  *charClasses() = 0;

    // Set the test text on which subsequent calls to next() will operate
    virtual  void      setText(const UnicodeString &s) = 0;

    // Find the next break position, starting from the prev break position, or from zero.
    // Return -1 after reaching end of string.
    virtual  int32_t   next(int32_t i) = 0;

    virtual ~RBBIMonkeyKind();

    // Error code for problems found while a monkey object is being set up.
    //   The base class constructor sets it to U_ZERO_ERROR.  It is a public
    //   data member, so code outside the class can read it directly.
    UErrorCode       deferredStatus;


protected:
    // Protected: only derived classes construct the base part.
    RBBIMonkeyKind();

private:
};
1928 
RBBIMonkeyKind()1929 RBBIMonkeyKind::RBBIMonkeyKind() {
1930     deferredStatus = U_ZERO_ERROR;
1931 }
1932 
// Out-of-line definition of the virtual destructor.  The base class has nothing to release.
RBBIMonkeyKind::~RBBIMonkeyKind() {
}
1935 
1936 
1937 //----------------------------------------------------------------------------------------
1938 //
1939 //   Random Numbers.  Similar to standard lib rand() and srand()
1940 //                    Not using library to
1941 //                      1.  Get same results on all platforms.
1942 //                      2.  Get access to current seed, to more easily reproduce failures.
1943 //
1944 //---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advances m_seed by one linear congruential step and returns a value
// in the range 0..32767 taken from bits 16-30 of the new seed.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;

    // Same value as (m_seed / 65536) % 32768 for an unsigned 32 bit seed.
    return (m_seed >> 16) & 0x7fff;
}
1952 
1953 
//
// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
//
// The adjacent string literals below concatenate into one bracketed set pattern, "[" ... "]".
// The backslashes are doubled in the C++ source, so the string itself holds \uXXXX and
// \UXXXXXXXX escape sequences as text rather than the characters themselves.
// RBBICharMonkey's constructor passes this string to a UnicodeSet constructor
// to build fExtendedPictSet.
//
static const char *gExtended_Pict = "["
    "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
    "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
    "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
    "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
    "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
    "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
    "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
    "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
    "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
    "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
    "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
    "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
    "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
    "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
    "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
    "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
    "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
    "]";
1978 
1979 //------------------------------------------------------------------------------------------
1980 //
1981 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1982 //                             of RBBIMonkeyKind.
1983 //
1984 //------------------------------------------------------------------------------------------
class RBBICharMonkey: public RBBIMonkeyKind {
public:
    RBBICharMonkey();
    virtual          ~RBBICharMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
private:
    // The list of character class sets returned by charClasses().
    UVector   *fSets;

    // Character class sets, all allocated by the constructor and deleted
    //   individually by the destructor.
    UnicodeSet  *fCRLFSet;                // [\r\n]
    UnicodeSet  *fControlSet;             // Grapheme_Cluster_Break = Control
    UnicodeSet  *fExtendSet;              // Grapheme_Cluster_Break = Extend
    UnicodeSet  *fZWJSet;                 // Grapheme_Cluster_Break = ZWJ
    UnicodeSet  *fRegionalIndicatorSet;   // Grapheme_Cluster_Break = Regional_Indicator
    UnicodeSet  *fPrependSet;             // Grapheme_Cluster_Break = Prepend
    UnicodeSet  *fSpacingSet;             // Grapheme_Cluster_Break = SpacingMark
    UnicodeSet  *fLSet;                   // Grapheme_Cluster_Break = L
    UnicodeSet  *fVSet;                   // Grapheme_Cluster_Break = V
    UnicodeSet  *fTSet;                   // Grapheme_Cluster_Break = T
    UnicodeSet  *fLVSet;                  // Grapheme_Cluster_Break = LV
    UnicodeSet  *fLVTSet;                 // Grapheme_Cluster_Break = LVT
    UnicodeSet  *fHangulSet;              // Union of the L, V, T, LV and LVT sets
    UnicodeSet  *fEmojiBaseSet;           // Grapheme_Cluster_Break = EB, plus a few listed code points
    UnicodeSet  *fEmojiModifierSet;       // Grapheme_Cluster_Break = EM
    UnicodeSet  *fExtendedPictSet;        // Built from the gExtended_Pict pattern
    UnicodeSet  *fEBGSet;                 // Grapheme_Cluster_Break = EBG
    UnicodeSet  *fEmojiNRKSet;            // Emoji, less Regional_Indicator and a list of other characters
    UnicodeSet  *fAnySet;                 // All code points, 0 through 0x10ffff

    // The text being tested, as set by setText().  Only the address is kept;
    //   the string itself is not copied.
    const UnicodeString *fText;
};
2017 
2018 
RBBICharMonkey()2019 RBBICharMonkey::RBBICharMonkey() {
2020     UErrorCode  status = U_ZERO_ERROR;
2021 
2022     fText = NULL;
2023 
2024     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2025     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
2026     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
2027     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
2028     fRegionalIndicatorSet =
2029                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2030     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2031     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2032     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2033     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2034     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2035     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2036     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2037     fHangulSet  = new UnicodeSet();
2038     fHangulSet->addAll(*fLSet);
2039     fHangulSet->addAll(*fVSet);
2040     fHangulSet->addAll(*fTSet);
2041     fHangulSet->addAll(*fLVSet);
2042     fHangulSet->addAll(*fLVTSet);
2043 
2044     fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2045     fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
2046     fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2047     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
2048     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
2049                 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2050     fAnySet           = new UnicodeSet(0, 0x10ffff);
2051 
2052     fSets             = new UVector(status);
2053     fSets->addElement(fCRLFSet,    status);
2054     fSets->addElement(fControlSet, status);
2055     fSets->addElement(fExtendSet,  status);
2056     fSets->addElement(fRegionalIndicatorSet, status);
2057     if (!fPrependSet->isEmpty()) {
2058         fSets->addElement(fPrependSet, status);
2059     }
2060     fSets->addElement(fSpacingSet, status);
2061     fSets->addElement(fHangulSet,  status);
2062     fSets->addElement(fAnySet,     status);
2063     fSets->addElement(fEmojiBaseSet, status);
2064     fSets->addElement(fEmojiModifierSet, status);
2065     fSets->addElement(fZWJSet,     status);
2066     fSets->addElement(fExtendedPictSet, status);
2067     fSets->addElement(fEBGSet,     status);
2068     fSets->addElement(fEmojiNRKSet,status);
2069     if (U_FAILURE(status)) {
2070         deferredStatus = status;
2071     }
2072 }
2073 
2074 
// Sets the text that later calls to next() examine.
//   Only the address of s is stored; the string is not copied.
void RBBICharMonkey::setText(const UnicodeString &s) {
    fText = &s;
}
2078 
2079 
2080 
// Finds the next grapheme cluster boundary after prevPos in the text given to
// setText(), by applying the GB rules below directly to the code points.
//   prevPos:  index of the previous boundary.
//   returns:  index of the next boundary, or -1 when prevPos is already at or
//             beyond the end of the text, or when deferredStatus holds a failure.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    //   Within the loop, 'continue' means no boundary before p2, so move on to the
    //   next position; 'break' means that the boundary is before p2.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x (Extend | ZWJ)
        //     cBase is updated only when c1 is not itself an Extend character, so that
        //     it still holds the character in front of a run of Extends for rule GB10.
        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
        //     The first test handles a modifier directly after the base character,
        //     the second one a modifier that follows Extend characters after the base.
        if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
            continue;
        }
        if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
                fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
            continue;
        }

        // Rule (GB11)   (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
        //     As coded here, the characters on either side of the ZWJ are tested
        //     against the extended pictographic set and the fEmojiNRKSet set.
        if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
                (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
            continue;
        }

        // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
        //                   Note: The first if condition is a little tricky. We only need to force
        //                      a break if there are three or more contiguous RIs. If there are
        //                      only two, a break following will occur via other rules, and will include
        //                      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB999)  Any  <break>  Any
        break;
    }

    // The loop is only left by way of a 'break', with the boundary located before p2.
    breakPos = p2;
    return breakPos;
}
2223 
2224 
2225 
// Returns the vector of character class sets that was filled in by the constructor.
//   The returned pointer is the member itself, not a copy.
UVector  *RBBICharMonkey::charClasses() {
    return fSets;
}
2229 
2230 
~RBBICharMonkey()2231 RBBICharMonkey::~RBBICharMonkey() {
2232     delete fSets;
2233     delete fCRLFSet;
2234     delete fControlSet;
2235     delete fExtendSet;
2236     delete fRegionalIndicatorSet;
2237     delete fPrependSet;
2238     delete fSpacingSet;
2239     delete fLSet;
2240     delete fVSet;
2241     delete fTSet;
2242     delete fLVSet;
2243     delete fLVTSet;
2244     delete fHangulSet;
2245     delete fAnySet;
2246     delete fEmojiBaseSet;
2247     delete fEmojiModifierSet;
2248     delete fZWJSet;
2249     delete fExtendedPictSet;
2250     delete fEBGSet;
2251     delete fEmojiNRKSet;
2252 }
2253 
2254 //------------------------------------------------------------------------------------------
2255 //
2256 //   class RBBIWordMonkey      Word Break specific implementation
2257 //                             of RBBIMonkeyKind.
2258 //
2259 //------------------------------------------------------------------------------------------
class RBBIWordMonkey: public RBBIMonkeyKind {
public:
    RBBIWordMonkey();
    virtual          ~RBBIWordMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual int32_t   next(int32_t i);
private:
    // Vector allocated by the constructor.
    //   NOTE(review): presumably the list handed out by charClasses(), as in
    //   RBBICharMonkey - confirm against the member function definitions.
    UVector      *fSets;

    // Character class sets used by the word break implementation.
    //   The constructor builds the first ones from Word_Break property values
    //   (CR, LF, Newline, Katakana, Regional_Indicator, Hebrew_Letter, ALetter,
    //   Single_Quote, Double_Quote).
    UnicodeSet  *fCRSet;
    UnicodeSet  *fLFSet;
    UnicodeSet  *fNewlineSet;
    UnicodeSet  *fRegionalIndicatorSet;
    UnicodeSet  *fKatakanaSet;
    UnicodeSet  *fHebrew_LetterSet;
    UnicodeSet  *fALetterSet;
    UnicodeSet  *fSingle_QuoteSet;
    UnicodeSet  *fDouble_QuoteSet;
    UnicodeSet  *fMidNumLetSet;
    UnicodeSet  *fMidLetterSet;
    UnicodeSet  *fMidNumSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fOtherSet;
    UnicodeSet  *fExtendSet;
    UnicodeSet  *fExtendNumLetSet;
    UnicodeSet  *fDictionarySet;
    UnicodeSet  *fEBaseSet;
    UnicodeSet  *fEBGSet;
    UnicodeSet  *fEModifierSet;
    UnicodeSet  *fZWJSet;
    UnicodeSet  *fExtendedPictSet;
    UnicodeSet  *fEmojiNRKSet;

    // Pointer to the text under test.
    //   NOTE(review): presumably assigned by setText() - confirm.
    const UnicodeString  *fText;
};
2297 
2298 
RBBIWordMonkey()2299 RBBIWordMonkey::RBBIWordMonkey()
2300 {
2301     UErrorCode  status = U_ZERO_ERROR;
2302 
2303     fSets            = new UVector(status);
2304 
2305     fCRSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2306     fLFSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2307     fNewlineSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2308     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2309     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2310     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2311     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2312     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2313     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2314     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2315     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2316     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2317     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2318     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2319     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2320     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2321 
2322     fEBaseSet         = new UnicodeSet(UNICODE_STRING_SIMPLE(
2323             "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2324     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"),          status);
2325     fEModifierSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"),           status);
2326     fZWJSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"),          status);
2327     fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2328     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
2329             "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2330 
2331     fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
2332     fDictionarySet->addAll(*fKatakanaSet);
2333     fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2334 
2335     fALetterSet->removeAll(*fDictionarySet);
2336 
2337     fOtherSet        = new UnicodeSet();
2338     if(U_FAILURE(status)) {
2339       deferredStatus = status;
2340       return;
2341     }
2342 
2343     fOtherSet->complement();
2344     fOtherSet->removeAll(*fCRSet);
2345     fOtherSet->removeAll(*fLFSet);
2346     fOtherSet->removeAll(*fNewlineSet);
2347     fOtherSet->removeAll(*fKatakanaSet);
2348     fOtherSet->removeAll(*fHebrew_LetterSet);
2349     fOtherSet->removeAll(*fALetterSet);
2350     fOtherSet->removeAll(*fSingle_QuoteSet);
2351     fOtherSet->removeAll(*fDouble_QuoteSet);
2352     fOtherSet->removeAll(*fMidLetterSet);
2353     fOtherSet->removeAll(*fMidNumSet);
2354     fOtherSet->removeAll(*fNumericSet);
2355     fOtherSet->removeAll(*fExtendNumLetSet);
2356     fOtherSet->removeAll(*fFormatSet);
2357     fOtherSet->removeAll(*fExtendSet);
2358     fOtherSet->removeAll(*fRegionalIndicatorSet);
2359     fOtherSet->removeAll(*fEBaseSet);
2360     fOtherSet->removeAll(*fEBGSet);
2361     fOtherSet->removeAll(*fEModifierSet);
2362     fOtherSet->removeAll(*fZWJSet);
2363     fOtherSet->removeAll(*fExtendedPictSet);
2364     fOtherSet->removeAll(*fEmojiNRKSet);
2365 
2366     // Inhibit dictionary characters from being tested at all.
2367     fOtherSet->removeAll(*fDictionarySet);
2368 
2369     fSets->addElement(fCRSet,                status);
2370     fSets->addElement(fLFSet,                status);
2371     fSets->addElement(fNewlineSet,           status);
2372     fSets->addElement(fRegionalIndicatorSet, status);
2373     fSets->addElement(fHebrew_LetterSet,     status);
2374     fSets->addElement(fALetterSet,           status);
2375     fSets->addElement(fSingle_QuoteSet,      status);
2376     fSets->addElement(fDouble_QuoteSet,      status);
2377     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
2378                                                         // from the test data. They are all in the dictionary set,
2379                                                         // which this (old, to be retired) monkey test cannot handle.
2380     fSets->addElement(fMidLetterSet,         status);
2381     fSets->addElement(fMidNumLetSet,         status);
2382     fSets->addElement(fMidNumSet,            status);
2383     fSets->addElement(fNumericSet,           status);
2384     fSets->addElement(fFormatSet,            status);
2385     fSets->addElement(fExtendSet,            status);
2386     fSets->addElement(fOtherSet,             status);
2387     fSets->addElement(fExtendNumLetSet,      status);
2388 
2389     fSets->addElement(fEBaseSet,             status);
2390     fSets->addElement(fEBGSet,               status);
2391     fSets->addElement(fEModifierSet,         status);
2392     fSets->addElement(fZWJSet,               status);
2393     fSets->addElement(fExtendedPictSet,      status);
2394     fSets->addElement(fEmojiNRKSet,          status);
2395 
2396     if (U_FAILURE(status)) {
2397         deferredStatus = status;
2398     }
2399 }
2400 
setText(const UnicodeString & s)2401 void RBBIWordMonkey::setText(const UnicodeString &s) {
2402     fText       = &s;
2403 }
2404 
2405 
// Finds the word boundary that follows a known boundary, by applying the word
// break rules one at a time to the text installed with setText().
//
//   prevPos   index of the previous boundary in the text.
//   returns   index of the next boundary, or -1 if construction of this object
//             failed (deferredStatus) or prevPos is at or beyond the end of the
//             text.
//
// Inside the loop, "continue" means "no boundary before p2, try the next
// position" and "break" means "boundary found before p2".  The rules are order
// dependent: the first one that matches decides.
int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    // All four positions start on prevPos; the first pass through the loop
    // only moves p3 forward ("warming up").
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;       // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format | ZWJ)*   Rule 4
        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
        //    When c2 is CR, LF or Newline the loop stops after a single step, so
        //    p3 is then the code point immediately after p2.
        do {
            p3 = fText->moveIndex32(p3, 1);
            c3 = fText->char32At(p3);
            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
               break;
            };
        }
        while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));


        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  (3)   CR x LF
        //     No Extend or Format characters may appear between the CR and LF.
        //     No explicit adjacency test is needed here: the advance loop above
        //     never skips anything after a CR, so when c1 is a CR, p2 is the
        //     position immediately following p1.
        //
        if (c1==0x0D && c2==0x0A) {
            continue;
        }

        // Rule (3a)  Break before and after newlines (including CR and LF)
        //
        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
            break;
        };
        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
            break;
        };

        // Rule (3c)    ZWJ x (Extended_Pict | EmojiNRK).
        //              Not ignoring extend chars, so peek into input text to
        //              get the potential ZWJ, the character immediately preceding c2.
        //              Sloppy UChar32 indexing: p2-1 may reference trail half
        //              but char32At will get the full code point.
        if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
            continue;
        }

        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
        //
        if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
             (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
            continue;
        }

        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
        if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
            (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
            continue;
        }

        // Rule (7a)     Hebrew_Letter x Single_Quote
        if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
            continue;
        }

        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
            continue;
        }

        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
        if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
            continue;
        }

        // Rule (8)    Numeric x Numeric
        if (fNumericSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
        if (fNumericSet->contains(c1) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
        if (fNumericSet->contains(c0) &&
            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
            fNumericSet->contains(c2)) {
            continue;
        }

        // Rule (12)  Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
        if (fNumericSet->contains(c1) &&
            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
            fNumericSet->contains(c3)) {
            continue;
        }

        // Rule (13)  Katakana x Katakana
        //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
        //                  all Katakana are handled by the dictionary breaker.
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
            continue;
        }

        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
             fExtendNumLetSet->contains(c2)) {
                continue;
        }

        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
                 fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
            continue;
        }

        // WB 14  (E_Base | EBG) x E_Modifier
        if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
            continue;
        }

        // Rule 15 - 17   Group pairs of Regional Indicators.
        //                Two indicators already behind the candidate position form a
        //                complete pair, so break; otherwise keep c1 and c2 together.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule 999.  Break found here.
        break;
    }

    // Every exit from the loop leaves the boundary at p2.
    breakPos = p2;
    return breakPos;
}
2592 
2593 
charClasses()2594 UVector  *RBBIWordMonkey::charClasses() {
2595     return fSets;
2596 }
2597 
2598 
~RBBIWordMonkey()2599 RBBIWordMonkey::~RBBIWordMonkey() {
2600     delete fSets;
2601     delete fCRSet;
2602     delete fLFSet;
2603     delete fNewlineSet;
2604     delete fKatakanaSet;
2605     delete fHebrew_LetterSet;
2606     delete fALetterSet;
2607     delete fSingle_QuoteSet;
2608     delete fDouble_QuoteSet;
2609     delete fMidNumLetSet;
2610     delete fMidLetterSet;
2611     delete fMidNumSet;
2612     delete fNumericSet;
2613     delete fFormatSet;
2614     delete fExtendSet;
2615     delete fExtendNumLetSet;
2616     delete fRegionalIndicatorSet;
2617     delete fDictionarySet;
2618     delete fOtherSet;
2619     delete fEBaseSet;
2620     delete fEBGSet;
2621     delete fEModifierSet;
2622     delete fZWJSet;
2623     delete fExtendedPictSet;
2624     delete fEmojiNRKSet;
2625 }
2626 
2627 
2628 
2629 
2630 //------------------------------------------------------------------------------------------
2631 //
2632 //   class RBBISentMonkey      Sentence Break specific implementation
2633 //                             of RBBIMonkeyKind.
2634 //
2635 //------------------------------------------------------------------------------------------
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    // Builds all of the character-class sets below.  A failure while building
    // them is recorded in deferredStatus.
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();

    // Returns fSets, the list of character classes.
    virtual  UVector *charClasses();

    // Stores a pointer to s (no copy); next() reads the text through it.
    virtual  void     setText(const UnicodeString &s);

    // Returns the sentence boundary following index i, or -1 if there is none
    // (i at or past the end of the text) or if construction failed.
    virtual int32_t   next(int32_t i);
private:
    // Index of the significant code point before posFrom, skipping Format and
    // Extend characters; -1 if posFrom is at or before the start of the text.
    int               moveBack(int posFrom);

    // Index of the significant code point after posFrom, skipping Format and
    // Extend characters; the text length if posFrom is at or past the end.
    int               moveForward(int posFrom);

    // Code point at pos, or -1 if pos is outside the text.
    UChar32           cAt(int pos);

    UVector      *fSets;          // List of the character classes; holds plain
                                  //   pointers to the sets below.

    // One set per Sentence_Break property value, unless noted otherwise.
    UnicodeSet  *fSepSet;         // Sep, plus LF (U+000A) and CR (U+000D).
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;       // Everything not in any of the other sets.
    UnicodeSet  *fExtendSet;

    const UnicodeString  *fText;  // Text being analyzed; not owned.  Set by setText().

};
2667 
RBBISentMonkey()2668 RBBISentMonkey::RBBISentMonkey()
2669 {
2670     UErrorCode  status = U_ZERO_ERROR;
2671 
2672     fSets            = new UVector(status);
2673 
2674     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2675     //                       set and made into character classes of their own.  For the monkey impl,
2676     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2677     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2678     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2679     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2680     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2681     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2682     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2683     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2684     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2685     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2686     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2687     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2688     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2689     fOtherSet        = new UnicodeSet();
2690 
2691     if(U_FAILURE(status)) {
2692       deferredStatus = status;
2693       return;
2694     }
2695 
2696     fOtherSet->complement();
2697     fOtherSet->removeAll(*fSepSet);
2698     fOtherSet->removeAll(*fFormatSet);
2699     fOtherSet->removeAll(*fSpSet);
2700     fOtherSet->removeAll(*fLowerSet);
2701     fOtherSet->removeAll(*fUpperSet);
2702     fOtherSet->removeAll(*fOLetterSet);
2703     fOtherSet->removeAll(*fNumericSet);
2704     fOtherSet->removeAll(*fATermSet);
2705     fOtherSet->removeAll(*fSContinueSet);
2706     fOtherSet->removeAll(*fSTermSet);
2707     fOtherSet->removeAll(*fCloseSet);
2708     fOtherSet->removeAll(*fExtendSet);
2709 
2710     fSets->addElement(fSepSet,       status);
2711     fSets->addElement(fFormatSet,    status);
2712     fSets->addElement(fSpSet,        status);
2713     fSets->addElement(fLowerSet,     status);
2714     fSets->addElement(fUpperSet,     status);
2715     fSets->addElement(fOLetterSet,   status);
2716     fSets->addElement(fNumericSet,   status);
2717     fSets->addElement(fATermSet,     status);
2718     fSets->addElement(fSContinueSet, status);
2719     fSets->addElement(fSTermSet,     status);
2720     fSets->addElement(fCloseSet,     status);
2721     fSets->addElement(fOtherSet,     status);
2722     fSets->addElement(fExtendSet,    status);
2723 
2724     if (U_FAILURE(status)) {
2725         deferredStatus = status;
2726     }
2727 }
2728 
2729 
2730 
setText(const UnicodeString & s)2731 void RBBISentMonkey::setText(const UnicodeString &s) {
2732     fText       = &s;
2733 }
2734 
charClasses()2735 UVector  *RBBISentMonkey::charClasses() {
2736     return fSets;
2737 }
2738 
2739 
2740 //  moveBack()   Find the "significant" code point preceding the index i.
2741 //               Skips over ($Extend | $Format)* .
2742 //
moveBack(int i)2743 int RBBISentMonkey::moveBack(int i) {
2744     if (i <= 0) {
2745         return -1;
2746     }
2747     UChar32   c;
2748     int32_t   j = i;
2749     do {
2750         j = fText->moveIndex32(j, -1);
2751         c = fText->char32At(j);
2752     }
2753     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2754     return j;
2755 
2756  }
2757 
2758 
moveForward(int i)2759 int RBBISentMonkey::moveForward(int i) {
2760     if (i>=fText->length()) {
2761         return fText->length();
2762     }
2763     UChar32   c;
2764     int32_t   j = i;
2765     do {
2766         j = fText->moveIndex32(j, 1);
2767         c = cAt(j);
2768     }
2769     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2770     return j;
2771 }
2772 
cAt(int pos)2773 UChar32 RBBISentMonkey::cAt(int pos) {
2774     if (pos<0 || pos>=fText->length()) {
2775         return -1;
2776     } else {
2777         return fText->char32At(pos);
2778     }
2779 }
2780 
// Finds the sentence boundary that follows a known boundary, by applying the
// sentence break rules one at a time to the text installed with setText().
//
//   prevPos   index of the previous boundary in the text.
//   returns   index of the next boundary, or -1 if construction of this object
//             failed (deferredStatus) or prevPos is at or beyond the end of the
//             text.
//
// Inside the loop, "continue" means "no boundary before p2, try the next
// position" and "break" means "boundary found before p2".  The rules are order
// dependent: the first one that matches decides.
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;                // Scratch code point used by the look-behind / look-ahead scans.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    // All four positions start on prevPos; the first pass through the loop
    // only moves p3 forward ("warming up").
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;     // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        //           The p2==(p1+1) test rejects a CR and LF that are separated
        //           by skipped Extend or Format characters.
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  (Upper | Lower) ATerm  x  Upper
        if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
                fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //           p8 first walks backwards from p1 over Sp* then Close* looking for the ATerm,
        //           then is reused to scan forwards from p2 for the terminating character.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            p8=p2;
            for (;;) {
                c = cAt(p8);
                // c == -1 means the scan ran off the end of the text.
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        //            NOTE(review): a Sep at p1 appears to be fully handled by Rule (4)
        //            above, so the Sep test here is not expected to fire - confirm.
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    // Every exit from the loop leaves the boundary at p2.
    breakPos = p2;
    return breakPos;
}
2935 
~RBBISentMonkey()2936 RBBISentMonkey::~RBBISentMonkey() {
2937     delete fSets;
2938     delete fSepSet;
2939     delete fFormatSet;
2940     delete fSpSet;
2941     delete fLowerSet;
2942     delete fUpperSet;
2943     delete fOLetterSet;
2944     delete fNumericSet;
2945     delete fATermSet;
2946     delete fSContinueSet;
2947     delete fSTermSet;
2948     delete fCloseSet;
2949     delete fOtherSet;
2950     delete fExtendSet;
2951 }
2952 
2953 
2954 
2955 //-------------------------------------------------------------------------------------------
2956 //
2957 //  RBBILineMonkey
2958 //
2959 //-------------------------------------------------------------------------------------------
2960 
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    virtual  UVector *charClasses();
    virtual  void     setText(const UnicodeString &s);
    virtual  int32_t  next(int32_t i);
    // NOTE(review): presumably applies line break rule LB9 (combining mark handling)
    // to the position/character pair passed in - confirm against the definition.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    UVector      *fSets;     // Initialized to NULL, then allocated by the constructor.

    // Character-class sets.  The constructor builds fBK ... fAL from the
    // Line_Break property value spelled by the two letters of the member name
    // (e.g. fBK from \p{Line_Break=BK}).
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;
    // NOTE(review): the members from here through fEmojiNRK, and fSG above, are
    // presumably built the same way or from explicit patterns - confirm in the
    // constructor.
    UnicodeSet  *fCJ;
    UnicodeSet  *fHL;
    UnicodeSet  *fID;
    UnicodeSet  *fRI;
    UnicodeSet  *fXX;
    UnicodeSet  *fEB;
    UnicodeSet  *fEM;
    UnicodeSet  *fZJ;
    UnicodeSet  *fExtendedPict;
    UnicodeSet  *fEmojiNRK;

    // The constructor's initializer list sets fSets, fCharBI, fText and
    // fNumberMatcher to NULL in this declaration order; keep the member
    // order in step with it.
    BreakIterator        *fCharBI;
    const UnicodeString  *fText;
    RegexMatcher         *fNumberMatcher;
};
3021 
RBBILineMonkey()3022 RBBILineMonkey::RBBILineMonkey() :
3023     RBBIMonkeyKind(),
3024     fSets(NULL),
3025 
3026     fCharBI(NULL),
3027     fText(NULL),
3028     fNumberMatcher(NULL)
3029 
3030 {
3031     if (U_FAILURE(deferredStatus)) {
3032         return;
3033     }
3034 
3035     UErrorCode  status = U_ZERO_ERROR;
3036 
3037     fSets  = new UVector(status);
3038 
3039     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3040     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3041     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3042     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3043     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3044     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3045     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3046     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3047     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3048     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3049     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3050     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3051     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3052     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3053     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3054     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3055     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3056     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3057     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3058     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3059     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3060     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3061     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3062     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3063     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3064     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3065     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3066     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3067     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3068     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3069     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3070     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3071     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3072     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3073     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3074     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3075     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3076     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3077     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3078     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE(
3079             "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
3080     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3081     fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
3082     fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
3083     fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
3084 
3085     if (U_FAILURE(status)) {
3086         deferredStatus = status;
3087         return;
3088     }
3089 
3090     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3091     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3092     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3093 
3094     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
3095     fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
3096 
3097     fSets->addElement(fBK, status);
3098     fSets->addElement(fCR, status);
3099     fSets->addElement(fLF, status);
3100     fSets->addElement(fCM, status);
3101     fSets->addElement(fNL, status);
3102     fSets->addElement(fWJ, status);
3103     fSets->addElement(fZW, status);
3104     fSets->addElement(fGL, status);
3105     fSets->addElement(fCB, status);
3106     fSets->addElement(fSP, status);
3107     fSets->addElement(fB2, status);
3108     fSets->addElement(fBA, status);
3109     fSets->addElement(fBB, status);
3110     fSets->addElement(fHY, status);
3111     fSets->addElement(fH2, status);
3112     fSets->addElement(fH3, status);
3113     fSets->addElement(fCL, status);
3114     fSets->addElement(fCP, status);
3115     fSets->addElement(fEX, status);
3116     fSets->addElement(fIN, status);
3117     fSets->addElement(fJL, status);
3118     fSets->addElement(fJT, status);
3119     fSets->addElement(fJV, status);
3120     fSets->addElement(fNS, status);
3121     fSets->addElement(fOP, status);
3122     fSets->addElement(fQU, status);
3123     fSets->addElement(fIS, status);
3124     fSets->addElement(fNU, status);
3125     fSets->addElement(fPO, status);
3126     fSets->addElement(fPR, status);
3127     fSets->addElement(fSY, status);
3128     fSets->addElement(fAI, status);
3129     fSets->addElement(fAL, status);
3130     fSets->addElement(fHL, status);
3131     fSets->addElement(fID, status);
3132     fSets->addElement(fWJ, status);
3133     fSets->addElement(fRI, status);
3134     fSets->addElement(fSG, status);
3135     fSets->addElement(fEB, status);
3136     fSets->addElement(fEM, status);
3137     fSets->addElement(fZJ, status);
3138     fSets->addElement(fExtendedPict, status);
3139     fSets->addElement(fEmojiNRK, status);
3140 
3141 
3142     const char *rules =
3143             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3144             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3145             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3146             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3147             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3148             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
3149 
3150     fNumberMatcher = new RegexMatcher(
3151         UnicodeString(rules, -1, US_INV), 0, status);
3152 
3153     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3154 
3155     if (U_FAILURE(status)) {
3156         deferredStatus = status;
3157     }
3158 }
3159 
3160 
setText(const UnicodeString & s)3161 void RBBILineMonkey::setText(const UnicodeString &s) {
3162     fText       = &s;
3163     fCharBI->setText(s);
3164     fNumberMatcher->reset(s);
3165 }
3166 
3167 //
3168 //  rule9Adjust
3169 //     Line Break TR rules 9 and 10 implementation.
3170 //     This deals with combining marks and other sequences that
3171 //     that must be treated as if they were something other than what they actually are.
3172 //
3173 //     This is factored out into a separate function because it must be applied twice for
3174 //     each potential break, once to the chars before the position being checked, then
3175 //     again to the text following the possible break.
3176 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3177 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3178     if (pos == -1) {
3179         // Invalid initial position.  Happens during the warmup iteration of the
3180         //   main loop in next().
3181         return;
3182     }
3183 
3184     int32_t  nPos = *nextPos;
3185 
3186     // LB 9  Keep combining sequences together.
3187     //  advance over any CM class chars.  Note that Line Break CM is different
3188     //  from the normal Grapheme Extend property.
3189     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3190           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3191         for (;;) {
3192             *nextChar = fText->char32At(nPos);
3193             if (!fCM->contains(*nextChar)) {
3194                 break;
3195             }
3196             nPos = fText->moveIndex32(nPos, 1);
3197         }
3198     }
3199 
3200 
3201     // LB 9 Treat X CM* as if it were x.
3202     //       No explicit action required.
3203 
3204     // LB 10  Treat any remaining combining mark as AL
3205     if (fCM->contains(*posChar)) {
3206         *posChar = 0x41;   // thisChar = 'A';
3207     }
3208 
3209     // Push the updated nextPos and nextChar back to our caller.
3210     // This only makes a difference if posChar got bigger by consuming a
3211     // combining sequence.
3212     *nextPos  = nPos;
3213     *nextChar = fText->char32At(nPos);
3214 }
3215 
3216 
3217 
next(int32_t startPos)3218 int32_t RBBILineMonkey::next(int32_t startPos) {
3219     UErrorCode status = U_ZERO_ERROR;
3220     int32_t    pos;       //  Index of the char following a potential break position
3221     UChar32    thisChar;  //  Character at above position "pos"
3222 
3223     int32_t    prevPos;   //  Index of the char preceding a potential break position
3224     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3225                           //   and thisChar may not be adjacent because combining
3226                           //   characters between them will be ignored.
3227 
3228     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3229     UChar32    prevCharX2;
3230 
3231     int32_t    nextPos;   //  Index of the next character following pos.
3232                           //     Usually skips over combining marks.
3233     int32_t    nextCPPos; //  Index of the code point following "pos."
3234                           //     May point to a combining mark.
3235     int32_t    tPos;      //  temp value.
3236     UChar32    c;
3237 
3238     if (U_FAILURE(deferredStatus)) {
3239         return -1;
3240     }
3241 
3242     if (startPos >= fText->length()) {
3243         return -1;
3244     }
3245 
3246 
3247     // Initial values for loop.  Loop will run the first time without finding breaks,
3248     //                           while the invalid values shift out and the "this" and
3249     //                           "prev" positions are filled in with good values.
3250     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3251     thisChar = prevChar  = prevCharX2 = 0;
3252     nextPos  = nextCPPos = startPos;
3253 
3254 
3255     // Loop runs once per position in the test text, until a break position
3256     //  is found.
3257     for (;;) {
3258         prevPosX2 = prevPos;
3259         prevCharX2 = prevChar;
3260 
3261         prevPos   = pos;
3262         prevChar  = thisChar;
3263 
3264         pos       = nextPos;
3265         thisChar  = fText->char32At(pos);
3266 
3267         nextCPPos = fText->moveIndex32(pos, 1);
3268         nextPos   = nextCPPos;
3269 
3270         // Rule LB2 - Break at end of text.
3271         if (pos >= fText->length()) {
3272             break;
3273         }
3274 
3275         // Rule LB 9 - adjust for combining sequences.
3276         //             We do this one out-of-order because the adjustment does not change anything
3277         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3278         //             be applied.
3279         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3280         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3281         c = fText->char32At(nextPos);
3282         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3283 
3284         // If the loop is still warming up - if we haven't shifted the initial
3285         //   -1 positions out of prevPos yet - loop back to advance the
3286         //    position in the input without any further looking for breaks.
3287         if (prevPos == -1) {
3288             continue;
3289         }
3290 
3291         // LB 4  Always break after hard line breaks,
3292         if (fBK->contains(prevChar)) {
3293             break;
3294         }
3295 
3296         // LB 5  Break after CR, LF, NL, but not inside CR LF
3297         if (prevChar == 0x0d && thisChar == 0x0a) {
3298             continue;
3299         }
3300         if (prevChar == 0x0d ||
3301             prevChar == 0x0a ||
3302             prevChar == 0x85)  {
3303             break;
3304         }
3305 
3306         // LB 6  Don't break before hard line breaks
3307         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3308             fBK->contains(thisChar)) {
3309                 continue;
3310         }
3311 
3312 
3313         // LB 7  Don't break before spaces or zero-width space.
3314         if (fSP->contains(thisChar)) {
3315             continue;
3316         }
3317 
3318         if (fZW->contains(thisChar)) {
3319             continue;
3320         }
3321 
3322         // LB 8  Break after zero width space
3323         if (fZW->contains(prevChar)) {
3324             break;
3325         }
3326 
3327         // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
3328         //       The monkey test's way of ignoring combining characters doesn't work
3329         //       for this rule. ZJ is also a CM. Need to get the actual character
3330         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3331         {
3332             int32_t prevIdx = fText->moveIndex32(pos, -1);
3333             UChar32 prevC = fText->char32At(prevIdx);
3334             if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
3335                 continue;
3336             }
3337         }
3338 
3339         // LB 9, 10  Already done, at top of loop.
3340         //
3341 
3342 
3343         // LB 11  Do not break before or after WORD JOINER and related characters.
3344         //    x  WJ
3345         //    WJ  x
3346         //
3347         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3348             continue;
3349         }
3350 
3351         // LB 12
3352         //    GL  x
3353         if (fGL->contains(prevChar)) {
3354             continue;
3355         }
3356 
3357         // LB 12a
3358         //    [^SP BA HY] x GL
3359         if (!(fSP->contains(prevChar) ||
3360               fBA->contains(prevChar) ||
3361               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3362             continue;
3363         }
3364 
3365 
3366 
3367         // LB 13  Don't break before closings.
3368         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3369         //        fall into LB 17 and the more general number regular expression.
3370         //
3371         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3372             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3373                                          fEX->contains(thisChar)  ||
3374             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3375             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3376             continue;
3377         }
3378 
3379         // LB 14 Don't break after OP SP*
3380         //       Scan backwards, checking for this sequence.
3381         //       The OP char could include combining marks, so we actually check for
3382         //           OP CM* SP*
3383         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3384         //       sequence into a ID char, so before scanning back through spaces,
3385         //       verify that prevChar is indeed a space.  The prevChar variable
3386         //       may differ from fText[prevPos]
3387         tPos = prevPos;
3388         if (fSP->contains(prevChar)) {
3389             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3390                 tPos=fText->moveIndex32(tPos, -1);
3391             }
3392         }
3393         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3394             tPos=fText->moveIndex32(tPos, -1);
3395         }
3396         if (fOP->contains(fText->char32At(tPos))) {
3397             continue;
3398         }
3399 
3400 
3401         // LB 15    QU SP* x OP
3402         if (fOP->contains(thisChar)) {
3403             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3404             int tPos = prevPos;
3405             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3406                 tPos = fText->moveIndex32(tPos, -1);
3407             }
3408             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3409                 tPos = fText->moveIndex32(tPos, -1);
3410             }
3411             if (fQU->contains(fText->char32At(tPos))) {
3412                 continue;
3413             }
3414         }
3415 
3416 
3417 
3418         // LB 16   (CL | CP) SP* x NS
3419         //    Scan backwards for SP* CM* (CL | CP)
3420         if (fNS->contains(thisChar)) {
3421             int tPos = prevPos;
3422             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3423                 tPos = fText->moveIndex32(tPos, -1);
3424             }
3425             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3426                 tPos = fText->moveIndex32(tPos, -1);
3427             }
3428             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3429                 continue;
3430             }
3431         }
3432 
3433 
3434         // LB 17        B2 SP* x B2
3435         if (fB2->contains(thisChar)) {
3436             //  Scan backwards, checking for the B2 CM* SP* sequence.
3437             tPos = prevPos;
3438             if (fSP->contains(prevChar)) {
3439                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3440                     tPos=fText->moveIndex32(tPos, -1);
3441                 }
3442             }
3443             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3444                 tPos=fText->moveIndex32(tPos, -1);
3445             }
3446             if (fB2->contains(fText->char32At(tPos))) {
3447                 continue;
3448             }
3449         }
3450 
3451 
3452         // LB 18    break after space
3453         if (fSP->contains(prevChar)) {
3454             break;
3455         }
3456 
3457         // LB 19
3458         //    x   QU
3459         //    QU  x
3460         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3461             continue;
3462         }
3463 
3464         // LB 20  Break around a CB
3465         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3466             break;
3467         }
3468 
3469         // LB 21
3470         if (fBA->contains(thisChar) ||
3471             fHY->contains(thisChar) ||
3472             fNS->contains(thisChar) ||
3473             fBB->contains(prevChar) )   {
3474             continue;
3475         }
3476 
3477         // LB 21a
3478         //   HL (HY | BA) x
3479         if (fHL->contains(prevCharX2) &&
3480                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3481             continue;
3482         }
3483 
3484         // LB 21b
3485         //   SY x HL
3486         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3487             continue;
3488         }
3489 
3490         // LB 22
3491         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3492             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3493             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3494             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3495             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3496             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3497             continue;
3498         }
3499 
3500 
3501         // LB 23    (AL | HL) x NU
3502         //          NU x (AL | HL)
3503         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3504             continue;
3505         }
3506         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3507             continue;
3508         }
3509 
3510         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3511         //      PR x (ID | EB | EM)
3512         //     (ID | EB | EM) x PO
3513         if (fPR->contains(prevChar) &&
3514                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3515             continue;
3516         }
3517         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3518                 fPO->contains(thisChar)) {
3519             continue;
3520         }
3521 
3522         // LB 24  Do not break between prefix and letters or ideographs.
3523         //         (PR | PO) x (AL | HL)
3524         //         (AL | HL) x (PR | PO)
3525         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3526                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3527             continue;
3528         }
3529         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3530                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3531             continue;
3532         }
3533 
3534 
3535 
3536         // LB 25    Numbers
3537         if (fNumberMatcher->lookingAt(prevPos, status)) {
3538             if (U_FAILURE(status)) {
3539                 break;
3540             }
3541             // Matched a number.  But could have been just a single digit, which would
3542             //    not represent a "no break here" between prevChar and thisChar
3543             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3544             if (numEndIdx > pos) {
3545                 // Number match includes at least our two chars being checked
3546                 if (numEndIdx > nextPos) {
3547                     // Number match includes additional chars.  Update pos and nextPos
3548                     //   so that next loop iteration will continue at the end of the number,
3549                     //   checking for breaks between last char in number & whatever follows.
3550                     pos = nextPos = numEndIdx;
3551                     do {
3552                         pos = fText->moveIndex32(pos, -1);
3553                         thisChar = fText->char32At(pos);
3554                     } while (fCM->contains(thisChar));
3555                 }
3556                 continue;
3557             }
3558         }
3559 
3560 
3561         // LB 26 Do not break a Korean syllable.
3562         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3563                                         fJV->contains(thisChar) ||
3564                                         fH2->contains(thisChar) ||
3565                                         fH3->contains(thisChar))) {
3566                                             continue;
3567                                         }
3568 
3569         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3570             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3571                 continue;
3572         }
3573 
3574         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3575             fJT->contains(thisChar)) {
3576                 continue;
3577         }
3578 
3579         // LB 27 Treat a Korean Syllable Block the same as ID.
3580         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3581             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3582             fIN->contains(thisChar)) {
3583                 continue;
3584             }
3585         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3586             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3587             fPO->contains(thisChar)) {
3588                 continue;
3589             }
3590         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3591             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3592                 continue;
3593             }
3594 
3595 
3596 
3597         // LB 28  Do not break between alphabetics ("at").
3598         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3599             continue;
3600         }
3601 
3602         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3603         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3604             continue;
3605         }
3606 
3607         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3608         //          (AL | NU) x OP
3609         //          CP x (AL | NU)
3610         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3611             continue;
3612         }
3613         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3614             continue;
3615         }
3616 
3617         // LB30a    RI RI <break> RI
3618         //             RI    x    RI
3619         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3620             break;
3621         }
3622         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3623             continue;
3624         }
3625 
3626         // LB30b    Emoji Base x Emoji Modifier
3627         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3628             continue;
3629         }
3630 
3631         // LB 31    Break everywhere else
3632         break;
3633 
3634     }
3635 
3636     return pos;
3637 }
3638 
3639 
charClasses()3640 UVector  *RBBILineMonkey::charClasses() {
3641     return fSets;
3642 }
3643 
3644 
~RBBILineMonkey()3645 RBBILineMonkey::~RBBILineMonkey() {
3646     delete fSets;
3647 
3648     delete fBK;
3649     delete fCR;
3650     delete fLF;
3651     delete fCM;
3652     delete fNL;
3653     delete fWJ;
3654     delete fZW;
3655     delete fGL;
3656     delete fCB;
3657     delete fSP;
3658     delete fB2;
3659     delete fBA;
3660     delete fBB;
3661     delete fHY;
3662     delete fH2;
3663     delete fH3;
3664     delete fCL;
3665     delete fCP;
3666     delete fEX;
3667     delete fIN;
3668     delete fJL;
3669     delete fJV;
3670     delete fJT;
3671     delete fNS;
3672     delete fOP;
3673     delete fQU;
3674     delete fIS;
3675     delete fNU;
3676     delete fPO;
3677     delete fPR;
3678     delete fSY;
3679     delete fAI;
3680     delete fAL;
3681     delete fCJ;
3682     delete fHL;
3683     delete fID;
3684     delete fRI;
3685     delete fSG;
3686     delete fXX;
3687     delete fEB;
3688     delete fEM;
3689     delete fZJ;
3690     delete fExtendedPict;
3691     delete fEmojiNRK;
3692 
3693     delete fCharBI;
3694     delete fNumberMatcher;
3695 }
3696 
3697 
3698 //-------------------------------------------------------------------------------------------
3699 //
3700 //   TestMonkey
3701 //
3702 //     params
3703 //       seed=nnnnn        Random number starting seed.
3704 //                         Setting the seed allows errors to be reproduced.
3705 //       loop=nnn          Looping count.  Controls running time.
3706 //                         -1:  run forever.
3707 //                          0 or greater:  run length.
3708 //
3709 //       type = char | word | line | sent | title
3710 //
3711 //  Example:
3712 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3713 //
3714 //-------------------------------------------------------------------------------------------
3715 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3716 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3717     int32_t val = defaultVal;
3718     name.append(" *= *(-?\\d+)");
3719     UErrorCode status = U_ZERO_ERROR;
3720     RegexMatcher m(name, params, 0, status);
3721     if (m.find()) {
3722         // The param exists.  Convert the string to an int.
3723         char valString[100];
3724         int32_t paramLength = m.end(1, status) - m.start(1, status);
3725         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3726             paramLength = (int32_t)(sizeof(valString)-2);
3727         }
3728         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3729         val = strtol(valString,  NULL, 10);
3730 
3731         // Delete this parameter from the params string.
3732         m.reset();
3733         params = m.replaceFirst("", status);
3734     }
3735     U_ASSERT(U_SUCCESS(status));
3736     return val;
3737 }
3738 #endif
3739 
3740 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3741 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3742                                     BreakIterator *bi,
3743                                     int expected[],
3744                                     int expectedcount)
3745 {
3746     int count = 0;
3747     int i = 0;
3748     int forward[50];
3749     bi->setText(ustr);
3750     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3751         forward[count] = i;
3752         if (count < expectedcount && expected[count] != i) {
3753             test->errln("break forward test failed: expected %d but got %d",
3754                         expected[count], i);
3755             break;
3756         }
3757         count ++;
3758     }
3759     if (count != expectedcount) {
3760         printStringBreaks(ustr, expected, expectedcount);
3761         test->errln("break forward test failed: missed %d match",
3762                     expectedcount - count);
3763         return;
3764     }
3765     // testing boundaries
3766     for (i = 1; i < expectedcount; i ++) {
3767         int j = expected[i - 1];
3768         if (!bi->isBoundary(j)) {
3769             printStringBreaks(ustr, expected, expectedcount);
3770             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3771             return;
3772         }
3773         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3774             if (bi->isBoundary(j)) {
3775                 printStringBreaks(ustr, expected, expectedcount);
3776                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3777                 return;
3778             }
3779         }
3780     }
3781 
3782     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3783         count --;
3784         if (forward[count] != i) {
3785             printStringBreaks(ustr, expected, expectedcount);
3786             test->errln("happy break test previous() failed: expected %d but got %d",
3787                         forward[count], i);
3788             break;
3789         }
3790     }
3791     if (count != 0) {
3792         printStringBreaks(ustr, expected, expectedcount);
3793         test->errln("break test previous() failed: missed a match");
3794         return;
3795     }
3796 
3797     // testing preceding
3798     for (i = 0; i < expectedcount - 1; i ++) {
3799         // int j = expected[i] + 1;
3800         int j = ustr.moveIndex32(expected[i], 1);
3801         for (; j <= expected[i + 1]; j ++) {
3802             if (bi->preceding(j) != expected[i]) {
3803                 printStringBreaks(ustr, expected, expectedcount);
3804                 test->errln("preceding(): Not expecting boundary at position %d", j);
3805                 return;
3806             }
3807         }
3808     }
3809 }
3810 #endif
3811 
TestWordBreaks(void)3812 void RBBITest::TestWordBreaks(void)
3813 {
3814 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3815 
3816     Locale        locale("en");
3817     UErrorCode    status = U_ZERO_ERROR;
3818     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3819     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3820     // Replaced any C+J characters in a row with a random sequence of characters
3821     // of the same length to make our C+J segmentation not get in the way.
3822     static const char *strlist[] =
3823     {
3824     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3825     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3826     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3827     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3828     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3829     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3830     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3831     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3832     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3833     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3834     "\\u2027\\U000e0067\\u0a47\\u00b7",
3835     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3836     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3837     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3838     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3839     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3840     "\\u0027\\u11af\\U000e0057\\u0602",
3841     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3842     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3843     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3844     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3845     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3846     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3847     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3848     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3849     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3850     "\\u18f4\\U000e0049\\u20e7\\u2027",
3851     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3852     "\\ua183\\u102d\\u0bec\\u003a",
3853     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3854     "\\u003a\\u0e57\\u0fad\\u002e",
3855     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3856     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3857     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3858     "\\u003a\\u0664\\u00b7\\u1fba",
3859     "\\u003b\\u0027\\u00b7\\u47a3",
3860     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3861     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3862     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3863     };
3864     int loop;
3865     if (U_FAILURE(status)) {
3866         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3867         return;
3868     }
3869     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3870         // printf("looping %d\n", loop);
3871         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3872         // RBBICharMonkey monkey;
3873         RBBIWordMonkey monkey;
3874 
3875         int expected[50];
3876         int expectedcount = 0;
3877 
3878         monkey.setText(ustr);
3879         int i;
3880         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3881             expected[expectedcount ++] = i;
3882         }
3883 
3884         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3885     }
3886     delete bi;
3887 #endif
3888 }
3889 
TestWordBoundary(void)3890 void RBBITest::TestWordBoundary(void)
3891 {
3892     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3893     Locale        locale("en");
3894     UErrorCode    status = U_ZERO_ERROR;
3895     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3896     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3897     UChar         str[50];
3898     static const char *strlist[] =
3899     {
3900     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3901     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3902     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3903     "\\u2027\\U000e0067\\u0a47\\u00b7",
3904     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3905     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3906     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3907     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3908     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3909     "\\u0027\\u11af\\U000e0057\\u0602",
3910     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3911     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3912     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3913     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3914     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3915     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3916     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3917     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3918     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3919     "\\u58f4\\U000e0049\\u20e7\\u2027",
3920     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3921     "\\ua183\\u102d\\u0bec\\u003a",
3922     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3923     "\\u003a\\u0e57\\u0fad\\u002e",
3924     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3925     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3926     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3927     "\\u003a\\u0664\\u00b7\\u1fba",
3928     "\\u003b\\u0027\\u00b7\\u47a3",
3929     };
3930     int loop;
3931     if (U_FAILURE(status)) {
3932         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3933         return;
3934     }
3935     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3936         // printf("looping %d\n", loop);
3937         u_unescape(strlist[loop], str, 20);
3938         UnicodeString ustr(str);
3939         int forward[50];
3940         int count = 0;
3941 
3942         bi->setText(ustr);
3943         int prev = 0;
3944         int i;
3945         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3946             forward[count ++] = i;
3947             if (i > prev) {
3948                 int j;
3949                 for (j = prev + 1; j < i; j ++) {
3950                     if (bi->isBoundary(j)) {
3951                         printStringBreaks(ustr, forward, count);
3952                         errln("happy boundary test failed: expected %d not a boundary",
3953                                j);
3954                         return;
3955                     }
3956                 }
3957             }
3958             if (!bi->isBoundary(i)) {
3959                 printStringBreaks(ustr, forward, count);
3960                 errln("happy boundary test failed: expected %d a boundary",
3961                        i);
3962                 return;
3963             }
3964             prev = i;
3965         }
3966     }
3967     delete bi;
3968 }
3969 
TestLineBreaks(void)3970 void RBBITest::TestLineBreaks(void)
3971 {
3972 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3973     Locale        locale("en");
3974     UErrorCode    status = U_ZERO_ERROR;
3975     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3976     const int32_t  STRSIZE = 50;
3977     UChar         str[STRSIZE];
3978     static const char *strlist[] =
3979     {
3980      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3981      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3982              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3983      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3984              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3985      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3986      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3987      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3988      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3989      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3990      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3991      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3992      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3993      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3994      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3995      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3996      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3997      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3998      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3999      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4000      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4001      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4002      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4003      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4004      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4005      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4006      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4007      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4008      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4009      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4010      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4011      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4012      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4013      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4014      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4015      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4016      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4017      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4018      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4019          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4020     };
4021     int loop;
4022     TEST_ASSERT_SUCCESS(status);
4023     if (U_FAILURE(status)) {
4024         return;
4025     }
4026     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4027         // printf("looping %d\n", loop);
4028         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4029         if (t >= STRSIZE) {
4030             TEST_ASSERT(FALSE);
4031             continue;
4032         }
4033 
4034 
4035         UnicodeString ustr(str);
4036         RBBILineMonkey monkey;
4037         if (U_FAILURE(monkey.deferredStatus)) {
4038             continue;
4039         }
4040 
4041         const int EXPECTEDSIZE = 50;
4042         int expected[EXPECTEDSIZE];
4043         int expectedcount = 0;
4044 
4045         monkey.setText(ustr);
4046         int i;
4047         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4048             if (expectedcount >= EXPECTEDSIZE) {
4049                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4050                 return;
4051             }
4052             expected[expectedcount ++] = i;
4053         }
4054 
4055         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4056     }
4057     delete bi;
4058 #endif
4059 }
4060 
TestSentBreaks(void)4061 void RBBITest::TestSentBreaks(void)
4062 {
4063 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4064     Locale        locale("en");
4065     UErrorCode    status = U_ZERO_ERROR;
4066     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4067     UChar         str[200];
4068     static const char *strlist[] =
4069     {
4070      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4071      "This\n",
4072      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4073      "\"Sentence ending with a quote.\" Bye.",
4074      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4075      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4076      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4077      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4078      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4079      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4080      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4081              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4082              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4083              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4084      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4085              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4086              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4087              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4088              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4089              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4090     };
4091     int loop;
4092     if (U_FAILURE(status)) {
4093         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4094         return;
4095     }
4096     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4097         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4098         UnicodeString ustr(str);
4099 
4100         RBBISentMonkey monkey;
4101         if (U_FAILURE(monkey.deferredStatus)) {
4102             continue;
4103         }
4104 
4105         const int EXPECTEDSIZE = 50;
4106         int expected[EXPECTEDSIZE];
4107         int expectedcount = 0;
4108 
4109         monkey.setText(ustr);
4110         int i;
4111         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4112             if (expectedcount >= EXPECTEDSIZE) {
4113                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4114                 return;
4115             }
4116             expected[expectedcount ++] = i;
4117         }
4118 
4119         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4120     }
4121     delete bi;
4122 #endif
4123 }
4124 
TestMonkey()4125 void RBBITest::TestMonkey() {
4126 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4127 
4128     UErrorCode     status    = U_ZERO_ERROR;
4129     int32_t        loopCount = 500;
4130     int32_t        seed      = 1;
4131     UnicodeString  breakType = "all";
4132     Locale         locale("en");
4133     UBool          useUText  = FALSE;
4134 
4135     if (quick == FALSE) {
4136         loopCount = 10000;
4137     }
4138 
4139     if (fTestParams) {
4140         UnicodeString p(fTestParams);
4141         loopCount = getIntParam("loop", p, loopCount);
4142         seed      = getIntParam("seed", p, seed);
4143 
4144         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4145         if (m.find()) {
4146             breakType = m.group(1, status);
4147             m.reset();
4148             p = m.replaceFirst("", status);
4149         }
4150 
4151         RegexMatcher u(" *utext", p, 0, status);
4152         if (u.find()) {
4153             useUText = TRUE;
4154             u.reset();
4155             p = u.replaceFirst("", status);
4156         }
4157 
4158 
4159         // m.reset(p);
4160         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4161             // Each option is stripped out of the option string as it is processed.
4162             // All options have been checked.  The option string should have been completely emptied..
4163             char buf[100];
4164             p.extract(buf, sizeof(buf), NULL, status);
4165             buf[sizeof(buf)-1] = 0;
4166             errln("Unrecognized or extra parameter:  %s\n", buf);
4167             return;
4168         }
4169 
4170     }
4171 
4172     if (breakType == "char" || breakType == "all") {
4173         RBBICharMonkey  m;
4174         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4175         if (U_SUCCESS(status)) {
4176             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4177             if (breakType == "all" && useUText==FALSE) {
4178                 // Also run a quick test with UText when "all" is specified
4179                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4180             }
4181         }
4182         else {
4183             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4184         }
4185         delete bi;
4186     }
4187 
4188     if (breakType == "word" || breakType == "all") {
4189         logln("Word Break Monkey Test");
4190         RBBIWordMonkey  m;
4191         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4192         if (U_SUCCESS(status)) {
4193             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4194         }
4195         else {
4196             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4197         }
4198         delete bi;
4199     }
4200 
4201     if (breakType == "line" || breakType == "all") {
4202         logln("Line Break Monkey Test");
4203         RBBILineMonkey  m;
4204         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4205         if (loopCount >= 10) {
4206             loopCount = loopCount / 5;   // Line break runs slower than the others.
4207         }
4208         if (U_SUCCESS(status)) {
4209             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4210         }
4211         else {
4212             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4213         }
4214         delete bi;
4215     }
4216 
4217     if (breakType == "sent" || breakType == "all"  ) {
4218         logln("Sentence Break Monkey Test");
4219         RBBISentMonkey  m;
4220         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4221         if (loopCount >= 10) {
4222             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4223         }
4224         if (U_SUCCESS(status)) {
4225             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4226         }
4227         else {
4228             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4229         }
4230         delete bi;
4231     }
4232 
4233 #endif
4234 }
4235 
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to generate and check;
//                 -1 means loop forever.
//       useUText - If TRUE, give the text to the break iterator through a UText
//                 instead of directly as a UnicodeString.
//
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4245 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4246                          int32_t numIterations, UBool useUText) {
4247 
4248 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4249 
4250     const int32_t    TESTSTRINGLEN = 500;
4251     UnicodeString    testText;
4252     int32_t          numCharClasses;
4253     UVector          *chClasses;
4254     int              expected[TESTSTRINGLEN*2 + 1];
4255     int              expectedCount = 0;
4256     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4257     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4258     char             reverseBreaks[TESTSTRINGLEN*2+1];
4259     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4260     char             followingBreaks[TESTSTRINGLEN*2+1];
4261     char             precedingBreaks[TESTSTRINGLEN*2+1];
4262     int              i;
4263     int              loopCount = 0;
4264 
4265     m_seed = seed;
4266 
4267     numCharClasses = mk.charClasses()->size();
4268     chClasses      = mk.charClasses();
4269 
    // Check for errors that occurred during the construction of the MonkeyKind object.
    //  Can't report them where they occurred because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
4273     if (U_FAILURE(mk.deferredStatus)) {
4274         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4275         return;
4276     }
4277 
4278     // Verify that the character classes all have at least one member.
4279     for (i=0; i<numCharClasses; i++) {
4280         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4281         if (s == NULL || s->size() == 0) {
4282             errln("Character Class #%d is null or of zero size.", i);
4283             return;
4284         }
4285     }
4286 
4287     while (loopCount < numIterations || numIterations == -1) {
4288         if (numIterations == -1 && loopCount % 10 == 0) {
4289             // If test is running in an infinite loop, display a periodic tic so
4290             //   we can tell that it is making progress.
4291             fprintf(stderr, ".");
4292         }
4293         // Save current random number seed, so that we can recreate the random numbers
4294         //   for this loop iteration in event of an error.
4295         seed = m_seed;
4296 
4297         // Populate a test string with data.
4298         testText.truncate(0);
4299         for (i=0; i<TESTSTRINGLEN; i++) {
4300             int32_t  aClassNum = m_rand() % numCharClasses;
4301             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4302             int32_t   charIdx = m_rand() % classSet->size();
4303             UChar32   c = classSet->charAt(charIdx);
4304             if (c < 0) {   // TODO:  deal with sets containing strings.
4305                 errln("%s:%d c < 0", __FILE__, __LINE__);
4306                 break;
4307             }
4308             // Do not assemble a supplementary character from randomly generated separate surrogates.
4309             //   (It could be a dictionary character)
4310             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4311                 continue;
4312             }
4313 
4314             testText.append(c);
4315         }
4316 
4317         // Calculate the expected results for this test string.
4318         mk.setText(testText);
4319         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4320         expectedBreaks[0] = 1;
4321         int32_t breakPos = 0;
4322         expectedCount = 0;
4323         for (;;) {
4324             breakPos = mk.next(breakPos);
4325             if (breakPos == -1) {
4326                 break;
4327             }
4328             if (breakPos > testText.length()) {
4329                 errln("breakPos > testText.length()");
4330             }
4331             expectedBreaks[breakPos] = 1;
4332             U_ASSERT(expectedCount<testText.length());
4333             expected[expectedCount ++] = breakPos;
4334             (void)expected;   // Set but not used warning.
4335                               // TODO (andy): check it out.
4336         }
4337 
4338         // Find the break positions using forward iteration
4339         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4340         if (useUText) {
4341             UErrorCode status = U_ZERO_ERROR;
4342             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4343             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4344             bi->setText(testUText, status);
4345             TEST_ASSERT_SUCCESS(status);
4346             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4347                                       //  This UText can be closed immediately, so long as the
4348                                       //  testText string continues to exist.
4349         } else {
4350             bi->setText(testText);
4351         }
4352 
4353         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4354             if (i < 0 || i > testText.length()) {
4355                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4356                 break;
4357             }
4358             forwardBreaks[i] = 1;
4359         }
4360 
4361         // Find the break positions using reverse iteration
4362         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4363         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4364             if (i < 0 || i > testText.length()) {
4365                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4366                 break;
4367             }
4368             reverseBreaks[i] = 1;
4369         }
4370 
4371         // Find the break positions using isBoundary() tests.
4372         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4373         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4374         for (i=0; i<=testText.length(); i++) {
4375             isBoundaryBreaks[i] = bi->isBoundary(i);
4376         }
4377 
4378 
4379         // Find the break positions using the following() function.
4380         // printf(".");
4381         memset(followingBreaks, 0, sizeof(followingBreaks));
4382         int32_t   lastBreakPos = 0;
4383         followingBreaks[0] = 1;
4384         for (i=0; i<testText.length(); i++) {
4385             breakPos = bi->following(i);
4386             if (breakPos <= i ||
4387                 breakPos < lastBreakPos ||
4388                 breakPos > testText.length() ||
4389                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4390                 errln("%s break monkey test: "
4391                     "Out of range value returned by BreakIterator::following().\n"
4392                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4393                          name, seed, i, breakPos, lastBreakPos);
4394                 break;
4395             }
4396             followingBreaks[breakPos] = 1;
4397             lastBreakPos = breakPos;
4398         }
4399 
4400         // Find the break positions using the preceding() function.
4401         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4402         lastBreakPos = testText.length();
4403         precedingBreaks[testText.length()] = 1;
4404         for (i=testText.length(); i>0; i--) {
4405             breakPos = bi->preceding(i);
4406             if (breakPos >= i ||
4407                 breakPos > lastBreakPos ||
4408                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4409                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4410                 errln("%s break monkey test: "
4411                     "Out of range value returned by BreakIterator::preceding().\n"
4412                     "index=%d;  prev returned %d; lastBreak=%d" ,
4413                     name,  i, breakPos, lastBreakPos);
4414                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4415                     precedingBreaks[i] = 2;   // Forces an error.
4416                 }
4417             } else {
4418                 if (breakPos >= 0) {
4419                     precedingBreaks[breakPos] = 1;
4420                 }
4421                 lastBreakPos = breakPos;
4422             }
4423         }
4424 
4425         // Compare the expected and actual results.
4426         for (i=0; i<=testText.length(); i++) {
4427             const char *errorType = NULL;
4428             if  (forwardBreaks[i] != expectedBreaks[i]) {
4429                 errorType = "next()";
4430             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4431                 errorType = "previous()";
4432             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4433                 errorType = "isBoundary()";
4434             } else if (followingBreaks[i] != expectedBreaks[i]) {
4435                 errorType = "following()";
4436             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4437                 errorType = "preceding()";
4438             }
4439 
4440 
4441             if (errorType != NULL) {
4442                 // Format a range of the test text that includes the failure as
4443                 //  a data item that can be included in the rbbi test data file.
4444 
4445                 // Start of the range is the last point where expected and actual results
4446                 //   both agreed that there was a break position.
4447                 int startContext = i;
4448                 int32_t count = 0;
4449                 for (;;) {
4450                     if (startContext==0) { break; }
4451                     startContext --;
4452                     if (expectedBreaks[startContext] != 0) {
4453                         if (count == 2) break;
4454                         count ++;
4455                     }
4456                 }
4457 
4458                 // End of range is two expected breaks past the start position.
4459                 int endContext = i + 1;
4460                 int ci;
4461                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4462                     for (;;) {
4463                         if (endContext >= testText.length()) {break;}
4464                         if (expectedBreaks[endContext-1] != 0) {
4465                             if (count == 0) break;
4466                             count --;
4467                         }
4468                         endContext ++;
4469                     }
4470                 }
4471 
4472                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4473                 UnicodeString errorText = "<data>";
4474                 /***if (strcmp(errorType, "next()") == 0) {
4475                     startContext = 0;
4476                     endContext = testText.length();
4477 
4478                     printStringBreaks(testText, expected, expectedCount);
4479                 }***/
4480 
4481                 for (ci=startContext; ci<endContext;) {
4482                     UnicodeString hexChars("0123456789abcdef");
4483                     UChar32  c;
4484                     int      bn;
4485                     c = testText.char32At(ci);
4486                     if (ci == i) {
4487                         // This is the location of the error.
4488                         errorText.append("<?>");
4489                     } else if (expectedBreaks[ci] != 0) {
4490                         // This a non-error expected break position.
4491                         errorText.append("\\");
4492                     }
4493                     if (c < 0x10000) {
4494                         errorText.append("\\u");
4495                         for (bn=12; bn>=0; bn-=4) {
4496                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4497                         }
4498                     } else {
4499                         errorText.append("\\U");
4500                         for (bn=28; bn>=0; bn-=4) {
4501                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4502                         }
4503                     }
4504                     ci = testText.moveIndex32(ci, 1);
4505                 }
4506                 errorText.append("\\");
4507                 errorText.append("</data>\n");
4508 
4509                 // Output the error
4510                 char  charErrorTxt[500];
4511                 UErrorCode status = U_ZERO_ERROR;
4512                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4513                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4514                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4515 
4516                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4517                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4518                     errorType, seed, i, charErrorTxt);
4519                 break;
4520             }
4521         }
4522 
4523         loopCount++;
4524     }
4525 #endif
4526 }
4527 
4528 
4529 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4530 //             This test checks the initial patch,
4531 //             which is to just keep it from crashing.  Correct word boundaries
4532 //             await a proper fix to the dictionary code.
4533 //
TestBug5532(void)4534 void RBBITest::TestBug5532(void)  {
4535    // Text includes a mixture of Thai and Latin.
4536    const unsigned char utf8Data[] = {
4537            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4538            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4539            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4540            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4541            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4542            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4543            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4544            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4545            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4546            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4547            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4548 
4549     UErrorCode status = U_ZERO_ERROR;
4550     UText utext=UTEXT_INITIALIZER;
4551     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4552     TEST_ASSERT_SUCCESS(status);
4553 
4554     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4555     TEST_ASSERT_SUCCESS(status);
4556     if (U_SUCCESS(status)) {
4557         bi->setText(&utext, status);
4558         TEST_ASSERT_SUCCESS(status);
4559 
4560         int32_t breakCount = 0;
4561         int32_t previousBreak = -1;
4562         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4563             // For now, just make sure that the break iterator doesn't hang.
4564             TEST_ASSERT(previousBreak < bi->current());
4565             previousBreak = bi->current();
4566         }
4567         TEST_ASSERT(breakCount > 0);
4568     }
4569     delete bi;
4570     utext_close(&utext);
4571 }
4572 
4573 
TestBug9983(void)4574 void RBBITest::TestBug9983(void)  {
4575     UnicodeString text = UnicodeString("\\u002A"  // * Other
4576                                        "\\uFF65"  //   Other
4577                                        "\\u309C"  //   Katakana
4578                                        "\\uFF9F"  //   Extend
4579                                        "\\uFF65"  //   Other
4580                                        "\\u0020"  //   Other
4581                                        "\\u0000").unescape();
4582 
4583     UErrorCode status = U_ZERO_ERROR;
4584     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4585         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4586     TEST_ASSERT_SUCCESS(status);
4587     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4588         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4589     TEST_ASSERT_SUCCESS(status);
4590     if (U_FAILURE(status)) {
4591         return;
4592     }
4593     int32_t offset, rstatus, iterationCount;
4594 
4595     brkiter->setText(text);
4596     brkiter->last();
4597     iterationCount = 0;
4598     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4599         iterationCount++;
4600         rstatus = brkiter->getRuleStatus();
4601         (void)rstatus;     // Suppress set but not used warning.
4602         if (iterationCount >= 10) {
4603            break;
4604         }
4605     }
4606     TEST_ASSERT(iterationCount == 6);
4607 
4608     brkiterPOSIX->setText(text);
4609     brkiterPOSIX->last();
4610     iterationCount = 0;
4611     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4612         iterationCount++;
4613         rstatus = brkiterPOSIX->getRuleStatus();
4614         (void)rstatus;     // Suppress set but not used warning.
4615         if (iterationCount >= 10) {
4616            break;
4617         }
4618     }
4619     TEST_ASSERT(iterationCount == 6);
4620 }
4621 
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4623 //
TestBug7547()4624 void RBBITest::TestBug7547() {
4625     UnicodeString rules;
4626     UErrorCode status = U_ZERO_ERROR;
4627     UParseError parseError;
4628     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4629     if (status != U_BRK_RULE_SYNTAX) {
4630         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4631     }
4632     if (parseError.line != 1 || parseError.offset != 0) {
4633         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4634     }
4635 }
4636 
4637 
TestBug12797()4638 void RBBITest::TestBug12797() {
4639     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4640     UErrorCode status = U_ZERO_ERROR;
4641     UParseError parseError;
4642     RuleBasedBreakIterator bi(rules, parseError, status);
4643     if (U_FAILURE(status)) {
4644         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4645         return;
4646     }
4647     UnicodeString text = "abc";
4648     bi.setText(text);
4649     bi.first();
4650     int32_t boundary = bi.next();
4651     if (boundary != 3) {
4652         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4653     }
4654 }
4655 
TestBug12918()4656 void RBBITest::TestBug12918() {
4657     // This test triggers an assertion failure in dictbe.cpp
4658     const UChar crasherString[] = { 0x3325, 0x4a16, 0 };
4659     UErrorCode status = U_ZERO_ERROR;
4660     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4661     if (U_FAILURE(status)) {
4662         errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4663         return;
4664     }
4665     ubrk_first(iter);
4666     int32_t pos = 0;
4667     int32_t lastPos = -1;
4668     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4669         if (pos <= lastPos) {
4670             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4671             break;
4672         }
4673     }
4674     ubrk_close(iter);
4675 }
4676 
4677 //
4678 //  TestDebug    -  A place-holder test for debugging purposes.
4679 //                  For putting in fragments of other tests that can be invoked
4680 //                  for tracing  without a lot of unwanted extra stuff happening.
4681 //
void RBBITest::TestDebug(void) {
    // Intentionally empty.  Paste fragments of other tests here temporarily
    // when they need to be run or traced in isolation.
}
4685 
TestProperties()4686 void RBBITest::TestProperties() {
4687     UErrorCode errorCode = U_ZERO_ERROR;
4688     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4689     if (!prependSet.isEmpty()) {
4690         errln(
4691             "[:GCB=Prepend:] is not empty any more. "
4692             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4693             "change this test to the opposite condition.");
4694     }
4695 }
4696 
4697 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4698